def _train(self):
    config = self.config

    step_tstart = time.time()
    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["num_deltas"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes
    self.timesteps_so_far += num_timesteps

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # keep only the best returns
    # select top performing directions if deltas_used < num_deltas
    max_rewards = np.max(noisy_returns, axis=1)
    if self.deltas_used > self.num_deltas:
        self.deltas_used = self.num_deltas

    percentile = 100 * (1 - (self.deltas_used / self.num_deltas))
    idx = np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, percentile)]
    noise_idx = noise_indices[idx]
    noisy_returns = noisy_returns[idx, :]

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        noisy_returns[:, 0] - noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_idx),
        batch_size=min(500, noisy_returns[:, 0].size))
    g /= noise_idx.size
    # scale the returns by their standard deviation
    if not np.isclose(np.std(noisy_returns), 0.0):
        g /= np.std(noisy_returns)
    assert (g.shape == (self.policy.num_params, )
            and g.dtype == np.float32)
    print("Number of policy params:", self.policy.num_params)
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(-g)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)

    step_tend = time.time()
    tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
    tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
    tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())

    tlogger.record_tabular("NoisyEpRewMean", noisy_returns.mean())
    tlogger.record_tabular("NoisyEpRewStd", noisy_returns.std())
    tlogger.record_tabular("NoisyEpLenMean", noisy_lengths.mean())

    tlogger.record_tabular("WeightsNorm", float(np.square(theta).sum()))
    tlogger.record_tabular("WeightsStd", float(np.std(theta)))
    tlogger.record_tabular("Grad2Norm", float(np.sqrt(np.square(g).sum())))
    tlogger.record_tabular("UpdateRatio", float(update_ratio))
    tlogger.dump_tabular()

    info = {
        "weights_norm": np.square(theta).sum(),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
        "timesteps_so_far": self.timesteps_so_far,
        "time_elapsed_this_iter": step_tend - step_tstart,
        "time_elapsed": step_tend - self.tstart,
    }

    result = dict(
        episode_reward_mean=eval_returns.mean(),
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
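# --- Illustrative sketch (not part of the trainer above): the update it
# computes is the mean of (positive return - negative return) times the
# corresponding noise direction, scaled by the std of the kept noisy
# returns. `utils.batched_weighted_sum` evaluates the same weighted sum in
# batches; the `deltas` argument here is a hypothetical stand-in for rows
# fetched from the shared noise table.
import numpy as np

def ars_gradient_sketch(noisy_returns, deltas):
    # noisy_returns: shape (n, 2); column 0 holds the positive-perturbation
    # returns, column 1 the negative ones. deltas: shape (n, num_params).
    weights = noisy_returns[:, 0] - noisy_returns[:, 1]
    g = (weights[:, None] * deltas).sum(axis=0) / deltas.shape[0]
    std = np.std(noisy_returns)
    if not np.isclose(std, 0.0):
        g /= std
    return g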
def _train(self):
    config = self.config

    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["num_rollouts"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # keep only the best returns
    # select top performing directions if rollouts_used < num_rollouts
    max_rewards = np.max(noisy_returns, axis=1)
    if self.rollouts_used > self.num_rollouts:
        self.rollouts_used = self.num_rollouts

    percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
    idx = np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, percentile)]
    noise_idx = noise_indices[idx]
    noisy_returns = noisy_returns[idx, :]

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        noisy_returns[:, 0] - noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_idx),
        batch_size=min(500, noisy_returns[:, 0].size))
    g /= noise_idx.size
    # scale the returns by their standard deviation
    if not np.isclose(np.std(noisy_returns), 0.0):
        g /= np.std(noisy_returns)
    assert (g.shape == (self.policy.num_params, )
            and g.dtype == np.float32)
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(-g)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)
    # update the reward list
    if len(all_eval_returns) > 0:
        self.reward_list.append(eval_returns.mean())

    # Now sync the filters
    FilterManager.synchronize(
        {DEFAULT_POLICY_ID: self.policy.get_filter()}, self.workers)

    info = {
        "weights_norm": np.square(theta).sum(),
        "weights_std": np.std(theta),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
    }

    result = dict(
        episode_reward_mean=np.mean(self.reward_list[-self.report_length:]),
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
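# --- Illustrative sketch of the direction filter used above, assuming NumPy
# only. Each direction is scored by the max of its positive/negative return,
# and only directions at or above the
# 100 * (1 - rollouts_used / num_rollouts) percentile are kept.
import numpy as np

def top_directions_sketch(noisy_returns, rollouts_used):
    num_rollouts = noisy_returns.shape[0]
    rollouts_used = min(rollouts_used, num_rollouts)
    max_rewards = np.max(noisy_returns, axis=1)
    percentile = 100 * (1 - rollouts_used / num_rollouts)
    return np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, percentile)]

# For example, with four directions and rollouts_used=2 only the top half
# survives:
# top_directions_sketch(
#     np.array([[1., 0.], [5., 2.], [3., 4.], [0., 2.]]), 2)  # -> [1, 2]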
def _train(self):
    config = self.config

    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["num_rollouts"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # keep only the best returns
    # select top performing directions if rollouts_used < num_rollouts
    max_rewards = np.max(noisy_returns, axis=1)
    if self.rollouts_used > self.num_rollouts:
        self.rollouts_used = self.num_rollouts

    percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
    idx = np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, percentile)]
    noise_idx = noise_indices[idx]
    noisy_returns = noisy_returns[idx, :]

    # Compute and take a step.
    g, count = utils.batched_weighted_sum(
        noisy_returns[:, 0] - noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_idx),
        batch_size=min(500, noisy_returns[:, 0].size))
    g /= noise_idx.size
    # scale the returns by their standard deviation
    if not np.isclose(np.std(noisy_returns), 0.0):
        g /= np.std(noisy_returns)
    assert (g.shape == (self.policy.num_params, )
            and g.dtype == np.float32)
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(-g)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)
    # update the reward list
    if len(all_eval_returns) > 0:
        self.reward_list.append(eval_returns.mean())

    # Now sync the filters
    FilterManager.synchronize(
        {"default": self.policy.get_filter()}, self.workers)

    info = {
        "weights_norm": np.square(theta).sum(),
        "weights_std": np.std(theta),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
    }

    result = dict(
        episode_reward_mean=np.mean(self.reward_list[-self.report_length:]),
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
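# --- Illustrative sketch of the shared noise table behind
# `self.noise.get(index, self.policy.num_params)` above, assuming it follows
# the usual ES-style convention: one large float32 array is generated once
# and shared by all workers, so a perturbation is identified by an integer
# offset into it rather than shipped around as a full vector. A minimal
# version (the real table is far larger):
import numpy as np

class SharedNoiseTableSketch:
    def __init__(self, count=1000 * 1000, seed=123):
        self.noise = np.random.RandomState(seed).randn(count).astype(
            np.float32)

    def get(self, i, dim):
        # The perturbation of size `dim` stored at offset `i`.
        return self.noise[i:i + dim]

    def sample_index(self, rng, dim):
        # A random offset that still leaves room for `dim` entries.
        return rng.randint(0, len(self.noise) - dim + 1)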
def _train(self):
    config = self.config

    # The iteration starts here: create the random environment
    # configurations for this iteration.
    logger.info("Creating random environment configurations")
    # ceil(config["num_rollouts"]/config["num_workers"]*2)*config["num_workers"]
    # is the exact number of environments created per iteration, since each
    # worker's rollout count is incremented by 2 (one rollout for the
    # positive and one for the negative perturbation), and the same
    # environment configuration is used for both members of a perturbation
    # pair. Tying the number of configurations directly to the number of
    # rollouts would correlate the two, leaving fewer rollouts with
    # distinct perturbations.
    random_env_config_id = create_random_env_configs.remote(
        self.extra_config["num_randomized_envs"],
        self.domain_randomization_config)
    self.random_env_config = SharedRandomEnvConfigsTable(
        ray.get(random_env_config_id))
    for worker in self.workers:
        worker.setRandomEnvConfig.remote(random_env_config_id)

    theta = self.policy.get_weights()
    assert theta.dtype == np.float32

    # Put the current policy weights in the object store.
    theta_id = ray.put(theta)
    # Use the actors to do rollouts, note that we pass in the ID of the
    # policy weights.
    results, num_episodes, num_timesteps = self._collect_results(
        theta_id, config["num_rollouts"])

    all_noise_indices = []
    all_training_returns = []
    all_training_lengths = []
    all_eval_returns = []
    all_eval_lengths = []

    # Loop over the results.
    for result in results:
        all_eval_returns += result.eval_returns
        all_eval_lengths += result.eval_lengths

        all_noise_indices += result.noise_indices
        all_training_returns += result.noisy_returns
        all_training_lengths += result.noisy_lengths

    assert len(all_eval_returns) == len(all_eval_lengths)
    assert (len(all_noise_indices) == len(all_training_returns) ==
            len(all_training_lengths))

    self.episodes_so_far += num_episodes

    # Assemble the results.
    eval_returns = np.array(all_eval_returns)
    eval_lengths = np.array(all_eval_lengths)
    noise_indices = np.array(all_noise_indices)
    noisy_returns = np.array(all_training_returns)
    noisy_lengths = np.array(all_training_lengths)

    # keep only the best returns
    # select top performing directions if rollouts_used < num_rollouts
    max_rewards = np.max(noisy_returns, axis=1)
    if self.rollouts_used > self.num_rollouts:
        self.rollouts_used = self.num_rollouts

    percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
    idx = np.arange(max_rewards.size)[
        max_rewards >= np.percentile(max_rewards, percentile)]
    noise_idx = noise_indices[idx]
    noisy_returns = noisy_returns[idx, :]

    # Compute and take a step (a step on theta, not an environment action).
    g, count = utils.batched_weighted_sum(
        noisy_returns[:, 0] - noisy_returns[:, 1],
        (self.noise.get(index, self.policy.num_params)
         for index in noise_idx),
        batch_size=min(500, noisy_returns[:, 0].size))
    g /= noise_idx.size
    # scale the returns by their standard deviation
    if not np.isclose(np.std(noisy_returns), 0.0):
        g /= np.std(noisy_returns)
    assert (g.shape == (self.policy.num_params, )
            and g.dtype == np.float32)
    # Compute the new weights theta.
    theta, update_ratio = self.optimizer.update(-g)
    # Set the new weights in the local copy of the policy.
    self.policy.set_weights(theta)
    # update the reward list
    if len(all_eval_returns) > 0:
        self.reward_list.append(eval_returns.mean())

    # Now sync the filters
    FilterManager.synchronize(
        {DEFAULT_POLICY_ID: self.policy.get_filter()}, self.workers)

    info = {
        "weights_norm": np.square(theta).sum(),
        "weights_std": np.std(theta),
        "grad_norm": np.square(g).sum(),
        "update_ratio": update_ratio,
        "episodes_this_iter": noisy_lengths.size,
        "episodes_so_far": self.episodes_so_far,
    }

    result = dict(
        episode_reward_mean=np.mean(self.reward_list[-self.report_length:]),
        episode_len_mean=eval_lengths.mean(),
        timesteps_this_iter=noisy_lengths.sum(),
        info=info)

    return result
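# --- Illustrative sketch of the per-iteration config broadcast at the top of
# the last `_train` above, assuming only Ray core. `make_configs` and
# `Worker` are hypothetical stand-ins for `create_random_env_configs` and the
# rollout workers: the configurations are generated once per iteration, put
# in the object store, and the same object reference is handed to every
# worker instead of re-serializing the data per call.
import ray

@ray.remote
def make_configs(num_envs, dr_config):
    # One randomized environment configuration per entry.
    return [{"env_id": i, **dr_config} for i in range(num_envs)]

@ray.remote
class Worker:
    def set_random_env_config(self, configs):
        # Ray resolves the object reference to the shared config list.
        self.configs = configs
        return len(self.configs)

# Usage sketch:
# ray.init()
# configs_ref = make_configs.remote(8, {"gravity_scale": 1.0})
# workers = [Worker.remote() for _ in range(4)]
# ray.get([w.set_random_env_config.remote(configs_ref) for w in workers])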