def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    paths = self.process_samples(itr, paths)
    epoch = itr / self.steps_per_epoch

    self.episode_rewards.extend(paths['undiscounted_returns'])
    last_average_return = np.mean(self.episode_rewards)
    for _ in range(self.n_train_steps):
        if (self.replay_buffer.n_transitions_stored >=
                self.min_buffer_size):
            self.evaluate = True
            qf_loss = self.optimize_policy(epoch, None)
            self.episode_qf_losses.append(qf_loss)

    if self.evaluate:
        if itr % self.target_network_update_freq == 0:
            self._qf_update_ops()

    if itr % self.steps_per_epoch == 0:
        if self.evaluate:
            mean100ep_rewards = round(
                np.mean(self.episode_rewards[-100:]), 1)
            mean100ep_qf_loss = np.mean(self.episode_qf_losses[-100:])

            tabular.record('Epoch', epoch)
            tabular.record('AverageReturn', np.mean(self.episode_rewards))
            tabular.record('StdReturn', np.std(self.episode_rewards))
            tabular.record('Episode100RewardMean', mean100ep_rewards)
            tabular.record('{}/Episode100LossMean'.format(self.qf.name),
                           mean100ep_qf_loss)

    return last_average_return
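# The DQN step above refreshes the target network via self._qf_update_ops()
# every `target_network_update_freq` iterations. A minimal sketch of what such
# a hard target update typically looks like in TF1-style code is given below;
# the helper name and variable lists are illustrative assumptions, not this
# class's actual API.
import tensorflow as tf


def build_hard_target_update_op(online_vars, target_vars):
    """Return one op that copies every online Q-network variable to its target twin."""
    assert len(online_vars) == len(target_vars)
    assigns = [
        tf.compat.v1.assign(target_var, online_var)
        for online_var, target_var in zip(online_vars, target_vars)
    ]
    return tf.group(*assigns)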
def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer, which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()
    last_returns = [float('nan')]

    trainer.enable_logging = False
    qf_losses = []
    for _ in trainer.step_epochs():
        for cycle in range(self._steps_per_epoch):
            trainer.step_episode = trainer.obtain_episodes(
                trainer.step_itr)
            if hasattr(self.exploration_policy, 'update'):
                self.exploration_policy.update(trainer.step_episode)
            qf_losses.extend(
                self._train_once(trainer.step_itr, trainer.step_episode))
            if (cycle == 0 and self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                trainer.enable_logging = True
                eval_episodes = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(trainer.step_itr,
                                               eval_episodes,
                                               discount=self._discount)
            trainer.step_itr += 1
        tabular.record('DQN/QFLossMean', np.mean(qf_losses))
        tabular.record('DQN/QFLossStd', np.std(qf_losses))

    return np.mean(last_returns)
def _fit_baseline(self, samples_data):
    """Update baselines from samples."""
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self.f_rewards(*policy_opt_input_values)
    returns_tensor = self.f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
    aug_returns = tensor_utils.concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate explained variance
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self.baseline.name), ev)

    # Fit baseline
    logger.log('Fitting baseline...')
    if hasattr(self.baseline, 'fit_with_samples'):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such
            as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    for _ in runner.step_epochs():
        if self.replay_buffer.n_transitions_stored < self.min_buffer_size:
            batch_size = self.min_buffer_size
        else:
            batch_size = None
        runner.step_path = runner.obtain_samples(runner.step_itr,
                                                 batch_size)
        for sample in runner.step_path:
            self.replay_buffer.store(obs=sample.observation,
                                     act=sample.action,
                                     rew=sample.reward,
                                     next_obs=sample.next_observation,
                                     done=sample.terminal)
        self.episode_rewards.append(
            sum([sample.reward for sample in runner.step_path]))

        for _ in range(self.gradient_steps):
            last_return, policy_loss, qf1_loss, qf2_loss = self.train_once(
                runner.step_itr, runner.step_path)
        log_performance(runner.step_itr,
                        self._obtain_evaluation_samples(
                            runner.get_env_copy(), num_trajs=10),
                        discount=self.discount)
        self.log_statistics(policy_loss, qf1_loss, qf2_loss)
        tabular.record('TotalEnvSteps', runner.total_env_steps)
        runner.step_itr += 1

    return last_return
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such
            as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None

    for _ in runner.step_epochs():
        for _ in range(self.n_samples):
            runner.step_path = runner.obtain_samples(runner.step_itr)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            last_return = self.train_once(runner.step_itr,
                                          runner.step_path)
            runner.step_itr += 1

    return last_return
def _train_once(self):
    """Perform one iteration of training."""
    policy_loss_list = []
    qf_loss_list = []
    alpha_loss_list = []
    alpha_list = []
    for _ in range(self._num_steps_per_epoch):
        indices = np.random.choice(range(self._num_train_tasks),
                                   self._meta_batch_size)
        policy_loss, qf_loss, alpha_loss, alpha = self._optimize_policy(
            indices)
        policy_loss_list.append(policy_loss)
        qf_loss_list.append(qf_loss)
        alpha_loss_list.append(alpha_loss)
        alpha_list.append(alpha)

    with tabular.prefix('MetaTrain/Average/'):
        tabular.record('PolicyLoss',
                       np.average(np.array(policy_loss_list)))
        tabular.record('QfLoss', np.average(np.array(qf_loss_list)))
        tabular.record('AlphaLoss',
                       np.average(np.array(alpha_loss_list)))
        tabular.record('Alpha', np.average(np.array(alpha_list)))
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    paths = self.process_samples(itr, paths)

    epoch = itr // self.n_samples
    i_sample = itr - epoch * self.n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = paths['average_return']
    self._all_returns.append(paths['average_return'])

    if (itr + 1) % self.n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self.n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def step(self, action):
    """Run one timestep of the environment's dynamics.

    When the end of the episode is reached, reset() should be called to
    reset the environment's internal state.

    Input
    -----
    action : an action provided by the policy, here a combination of f
        and l1, l2, ...

    Outputs
    -------
    (observation, reward, done, info)
    observation : agent's observation of the current environment
    reward [Float] : amount of reward due to the previous action
    done : a boolean, indicating whether the episode has ended
    info : a dictionary containing other diagnostic information from the
        previous action

    """
    self.step_cnt += 1

    action = np.clip(action.copy(), self.action_space.low,
                     self.action_space.high)
    f = action[0]  # input force
    l = np.sum(action[1:])  # bar length

    f_total = f + (self.m1 + self.m2) * self.g - self.k * self.y1
    a = f_total / (self.m1 + self.m2)
    self.y1, self.v1 = self.simulate_w_mid_point_euler(self.y1, self.v1, a)
    y2 = self.y1 + l

    obs = np.array([self.y1, self.v1])
    reward = self.calc_reward(y2, f, self.v1)
    done = False
    info = {}

    if self.step_cnt == self.n_steps_per_episode:
        print()
        print('y2: ', y2)
        print('v2: ', self.v1)
        print('l: ', l)
        tabular.record('Env/FinalL', l)

    return obs, reward, done, info
def fit(self, paths):
    """Fit regressor based on paths.

    Args:
        paths (list[dict]): Sample paths.

    """
    xs = np.concatenate([p['observations'] for p in paths])
    if isinstance(self._env_spec.observation_space, akro.Image) and \
            len(xs[0].shape) < \
            len(self._env_spec.observation_space.shape):
        xs = self._env_spec.observation_space.unflatten_n(xs)
    ys = np.concatenate([p['returns'] for p in paths])
    ys = ys.reshape((-1, 1))

    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean.load(np.mean(xs, axis=0, keepdims=True))
        self._x_std.load(np.std(xs, axis=0, keepdims=True) + 1e-8)
        self._old_network.x_mean.load(np.mean(xs, axis=0, keepdims=True))
        self._old_network.x_std.load(
            np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        self._y_mean.load(np.mean(ys, axis=0, keepdims=True))
        self._y_std.load(np.std(ys, axis=0, keepdims=True) + 1e-8)
        self._old_network.y_mean.load(np.mean(ys, axis=0, keepdims=True))
        self._old_network.y_std.load(
            np.std(ys, axis=0, keepdims=True) + 1e-8)

    inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    tabular.record('{}/LossBefore'.format(self._name), loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record('{}/LossAfter'.format(self._name), loss_after)
    if self._use_trust_region:
        tabular.record('{}/MeanKL'.format(self._name),
                       self._optimizer.constraint_val(inputs))
    tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
    self._old_model.parameters = self.parameters
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_path_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))

    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def fit(self, xs, ys):
    """Fit with input data xs and label ys.

    Args:
        xs (numpy.ndarray): Input data.
        ys (numpy.ndarray): Label of input data.

    """
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self.model.networks['default'].x_mean.load(
            np.mean(xs, axis=0, keepdims=True))
        self.model.networks['default'].x_std.load(
            np.std(xs, axis=0, keepdims=True) + 1e-8)
        self._old_model.networks['default'].x_mean.load(
            np.mean(xs, axis=0, keepdims=True))
        self._old_model.networks['default'].x_std.load(
            np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        self.model.networks['default'].y_mean.load(
            np.mean(ys, axis=0, keepdims=True))
        self.model.networks['default'].y_std.load(
            np.std(ys, axis=0, keepdims=True) + 1e-8)
        self._old_model.networks['default'].y_mean.load(
            np.mean(ys, axis=0, keepdims=True))
        self._old_model.networks['default'].y_std.load(
            np.std(ys, axis=0, keepdims=True) + 1e-8)

    inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    tabular.record('{}/LossBefore'.format(self._name), loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record('{}/LossAfter'.format(self._name), loss_after)
    if self._use_trust_region:
        tabular.record('{}/MeanKL'.format(self._name),
                       self._optimizer.constraint_val(inputs))
    tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
    self._old_model.parameters = self.model.parameters
def train_once(self, itr, paths, obs_upper, obs_lower, action_upper,
               action_lower, evaluate_paths):
    """Train the algorithm once."""
    paths = self.process_samples(itr, paths)
    evaluate_paths = self.process_samples(itr, evaluate_paths)
    epoch = itr / self.n_epoch_cycles

    self.episode_rewards.extend(evaluate_paths['undiscounted_returns'])
    last_average_return = np.mean(self.episode_rewards)
    for train_itr in range(self.n_train_steps):
        if (self.replay_buffer.n_transitions_stored >=
                self.min_buffer_size):
            self.evaluate = True
            qf_loss = self.optimize_policy(epoch, None)
            self.episode_qf_losses.append(qf_loss)

    if self.evaluate:
        if itr % self.target_network_update_freq == 0:
            self._qf_update_ops()

    if itr % self.n_epoch_cycles == 0:
        if self.evaluate:
            mean100ep_rewards = round(
                np.mean(self.episode_rewards[-100:]), 1)
            mean100ep_qf_loss = np.mean(self.episode_qf_losses[-100:])

            tabular.record('Epoch', epoch)
            tabular.record('AverageReturn', np.mean(self.episode_rewards))
            tabular.record('StdReturn', np.std(self.episode_rewards))
            tabular.record('Episode100RewardMean', mean100ep_rewards)
            tabular.record('{}/Episode100LossMean'.format(self.qf.name),
                           mean100ep_qf_loss)

        if not self.smooth_return:
            self.episode_rewards = []

    return last_average_return
def _train_irl(self, paths, itr=0):
    if self.no_reward:
        total_rew = 0.
        for path in paths:
            total_rew += np.sum(path['rewards'])
            path['rewards'] *= 0
        tabular.record('OriginalTaskAverageReturn',
                       total_rew / float(len(paths)))

    if self.irl_model_wt <= 0:
        return paths

    max_iters = self.discrim_train_itrs
    mean_loss = self._irl.train(paths)
    tabular.record('IRLLoss', mean_loss)
    self.irl_params = self._irl.get_params()

    estimated_rewards = self._irl.eval(paths, gamma=self.discount, itr=itr)

    tabular.record('IRLRewardMean',
                   np.mean(np.concatenate(estimated_rewards)))
    tabular.record('IRLRewardMax',
                   np.max(np.concatenate(estimated_rewards)))
    tabular.record('IRLRewardMin',
                   np.min(np.concatenate(estimated_rewards)))

    # Replace the original reward signal with the learned reward signal.
    # This will be used by agents to learn the policy.
    if self._irl.score_trajectories:
        for i, path in enumerate(paths):
            path['rewards'][-1] += self.irl_model_wt * estimated_rewards[i]
    else:
        for i, path in enumerate(paths):
            path['rewards'] += self.irl_model_wt * estimated_rewards[i]
    return paths
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such
            as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]

    runner.enable_logging = False
    qf_losses = []
    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_trajectories(runner.step_itr)
            qf_losses.extend(
                self.train_once(runner.step_itr, runner.step_path))
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                eval_samples = obtain_evaluation_samples(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_samples,
                                               discount=self._discount)
            runner.step_itr += 1
        tabular.record('DQN/QFLossMean', np.mean(qf_losses))
        tabular.record('DQN/QFLossStd', np.std(qf_losses))

    return np.mean(last_returns)
def fit(self, xs, ys):
    """Fit with input data xs and label ys.

    Args:
        xs (numpy.ndarray): Input data.
        ys (numpy.ndarray): Label of input data.

    """
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    sess = tf.compat.v1.get_default_session()
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        feed_dict = {
            self._x_mean_var_ph: np.mean(xs, axis=0, keepdims=True),
            self._x_std_var_ph: np.std(xs, axis=0, keepdims=True) + 1e-8,
        }
        sess.run([
            self._assign_x_mean,
            self._assign_x_std,
        ], feed_dict=feed_dict)  # yapf: disable
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        feed_dict = {
            self._y_mean_var_ph: np.mean(ys, axis=0, keepdims=True),
            self._y_std_var_ph: np.std(ys, axis=0, keepdims=True) + 1e-8,
        }
        sess.run([self._assign_y_mean, self._assign_y_std],
                 feed_dict=feed_dict)

    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = [xs, ys, old_means, old_log_stds]
    else:
        inputs = [xs, ys]

    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + '/'
    else:
        prefix = ''
    tabular.record(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    if self._use_trust_region:
        tabular.record(prefix + 'MeanKL',
                       self._optimizer.constraint_val(inputs))
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
def fit(self, xs, ys):
    """Optimize the regressor based on the inputs."""
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    sess = tf.get_default_session()
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        sess.run([
            tf.assign(self._x_mean_var,
                      np.mean(xs, axis=0, keepdims=True)),
            tf.assign(self._x_std_var,
                      np.std(xs, axis=0, keepdims=True) + 1e-8),
        ])
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        sess.run([
            tf.assign(self._y_mean_var,
                      np.mean(ys, axis=0, keepdims=True)),
            tf.assign(self._y_std_var,
                      np.std(ys, axis=0, keepdims=True) + 1e-8),
        ])

    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = [xs, ys, old_means, old_log_stds]
    else:
        inputs = [xs, ys]

    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + '/'
    else:
        prefix = ''
    tabular.record(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    if self._use_trust_region:
        tabular.record(prefix + 'MeanKL',
                       self._optimizer.constraint_val(inputs))
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
def extra_recording(self, itr):
    """Record extra training statistics per-iteration.

    Parameters
    ----------
    itr : int
        The iteration number.

    """
    tabular.record('Max Divergence', np.max(self.divergences))
    tabular.record('Min Divergence', np.min(self.divergences))
    tabular.record('Mean Divergence', np.mean(self.divergences))
    return None
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (garage.EpisodeBatch): Episodes collected using the
            current policy.

    Returns:
        float: The average return of epoch cycle.

    """
    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    average_return = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = average_return
    self._all_returns.append(average_return)

    # -- Stage: Update policy distribution.
    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        best_inds = np.argsort(-avg_rtns)[:self._n_best]
        best_params = np.array(self._all_params)[best_inds]

        # MLE of normal distribution
        self._cur_mean = best_params.mean(axis=0)
        self._cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self._cur_mean)

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params.clear()

    # -- Stage: Generate a new policy for next path sampling
    self._cur_params = self._sample_params(itr)
    self._all_params.append(self._cur_params.copy())
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
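# The update stage above refits a diagonal Gaussian over policy parameters to
# the elite (highest-return) samples. A standalone numpy sketch of that
# cross-entropy-method update follows; the function and argument names are
# illustrative, not part of the class above.
import numpy as np


def cem_update(all_params, all_returns, n_best):
    """Refit a diagonal Gaussian to the n_best highest-return parameter samples."""
    all_params = np.asarray(all_params)
    all_returns = np.asarray(all_returns)
    best_inds = np.argsort(-all_returns)[:n_best]  # indices of the elites
    best_params = all_params[best_inds]
    new_mean = best_params.mean(axis=0)  # MLE of the Gaussian mean
    new_std = best_params.std(axis=0)  # MLE of the per-dimension std
    return new_mean, new_std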
def fit(self, xs, ys):
    """Fit with input data xs and label ys."""
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self.model.networks['default'].x_mean.load(
            np.mean(xs, axis=0, keepdims=True))
        self.model.networks['default'].x_std.load(
            np.std(xs, axis=0, keepdims=True) + 1e-8)

    inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    tabular.record('{}/LossBefore'.format(self._name), loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record('{}/LossAfter'.format(self._name), loss_after)
    tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (garage.EpisodeBatch): Episodes collected using the
            current policy.

    Returns:
        float: The average return of epoch cycle.

    """
    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    average_return = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = average_return
    self._all_returns.append(average_return)

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def fit(self, xs, ys):
    """Fit with input data xs and label ys."""
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        tf.compat.v1.get_default_session().run(
            tf.group(
                tf.compat.v1.assign(self.x_mean_var, new_mean),
                tf.compat.v1.assign(self.x_std_var, new_std),
            ))

    inputs = [xs, ys]
    loss_before = self.optimizer.loss(inputs)
    if self.name:
        prefix = self.name + '/'
    else:
        prefix = ''
    tabular.record(prefix + 'LossBefore', loss_before)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    tabular.record(prefix + 'LossAfter', loss_after)
    tabular.record(prefix + 'dLoss', loss_before - loss_after)
def fit(self, paths):
    """Fit regressor based on paths.

    Args:
        paths (list[dict]): Sample paths.

    """
    xs = np.concatenate([p['observations'] for p in paths])
    ys = np.concatenate([p['returns'] for p in paths])
    ys = ys.reshape((-1, 1))

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean.load(np.mean(xs, axis=0, keepdims=True))
        self._x_std.load(np.std(xs, axis=0, keepdims=True) + 1e-8)

    inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    tabular.record('{}/LossBefore'.format(self._name), loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record('{}/LossAfter'.format(self._name), loss_after)
    tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
def train_once(self, itr, trajectories):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        trajectories (TrajectoryBatch): Batch of trajectories.

    Returns:
        numpy.float64: Average return.

    """
    epoch = itr / self._steps_per_epoch

    self.episode_rewards.extend(
        [traj.rewards.sum() for traj in trajectories.split()])
    last_average_return = np.mean(self.episode_rewards)
    for _ in range(self._n_train_steps):
        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            qf_loss = self.optimize_policy(None)
            self.episode_qf_losses.append(qf_loss)

    if self.replay_buffer.n_transitions_stored >= self._min_buffer_size:
        if itr % self._target_network_update_freq == 0:
            self._qf_update_ops()

    if itr % self._steps_per_epoch == 0:
        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            mean100ep_rewards = round(
                np.mean(self.episode_rewards[-100:]), 1)
            mean100ep_qf_loss = np.mean(self.episode_qf_losses[-100:])

            tabular.record('Epoch', epoch)
            tabular.record('Episode100RewardMean', mean100ep_rewards)
            tabular.record('{}/Episode100LossMean'.format(self._qf.name),
                           mean100ep_qf_loss)

    return last_average_return
def optimize_policy(self, samples_data):
    """Optimize the policy using the samples.

    Args:
        samples_data (dict): Processed sample data.
            See metarl.tf.paths_to_tensors() for details.

    """
    # Initial BFGS parameter values.
    x0 = np.hstack([self._param_eta, self._param_v])
    # Set parameter boundaries: \eta>=1e-12, v unrestricted.
    bounds = [(-np.inf, np.inf) for _ in x0]
    bounds[0] = (1e-12, np.inf)

    # Optimize dual
    eta_before = self._param_eta
    logger.log('Computing dual before')
    self._feat_diff = self._features(samples_data)
    dual_opt_input_values = self._dual_opt_input_values(samples_data)
    dual_before = self._f_dual(*dual_opt_input_values)
    logger.log('Optimizing dual')

    def eval_dual(x):
        """Evaluate dual function loss.

        Args:
            x (numpy.ndarray): Input to dual function.

        Returns:
            numpy.float64: Dual function loss.

        """
        self._param_eta = x[0]
        self._param_v = x[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        return self._f_dual(*dual_opt_input_values)

    def eval_dual_grad(x):
        """Evaluate gradient of dual function loss.

        Args:
            x (numpy.ndarray): Input to dual function.

        Returns:
            numpy.ndarray: Gradient of dual function loss.

        """
        self._param_eta = x[0]
        self._param_v = x[1:]
        dual_opt_input_values = self._dual_opt_input_values(samples_data)
        grad = self._f_dual_grad(*dual_opt_input_values)
        eta_grad = float(grad[0])
        v_grad = grad[1]
        return np.hstack([eta_grad, v_grad])

    params_ast, _, _ = self._dual_optimizer(func=eval_dual,
                                            x0=x0,
                                            fprime=eval_dual_grad,
                                            bounds=bounds,
                                            **self._dual_optimizer_args)

    logger.log('Computing dual after')
    self._param_eta, self._param_v = params_ast[0], params_ast[1:]
    dual_opt_input_values = self._dual_opt_input_values(samples_data)
    dual_after = self._f_dual(*dual_opt_input_values)

    # Optimize policy
    policy_opt_input_values = self._policy_opt_input_values(samples_data)
    logger.log('Computing policy loss before')
    loss_before = self._optimizer.loss(policy_opt_input_values)
    logger.log('Computing policy KL before')
    policy_kl_before = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing policy')
    self._optimizer.optimize(policy_opt_input_values)
    logger.log('Computing policy KL')
    policy_kl = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Computing policy loss after')
    loss_after = self._optimizer.loss(policy_opt_input_values)

    tabular.record('EtaBefore', eta_before)
    tabular.record('EtaAfter', self._param_eta)
    tabular.record('DualBefore', dual_before)
    tabular.record('DualAfter', dual_after)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)

    self._old_policy.model.parameters = self.policy.model.parameters
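# `self._dual_optimizer` above is called with func/x0/fprime/bounds keyword
# arguments, which matches the calling convention of scipy's bounded L-BFGS
# routine. A hedged sketch of wiring that dual optimization with
# scipy.optimize.fmin_l_bfgs_b follows; the helper name and the max-iteration
# argument are assumptions, not this class's actual configuration.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b


def solve_reps_dual(eval_dual, eval_dual_grad, eta0, v0, max_opt_itr=50):
    """Minimize the REPS dual over (eta, v), keeping eta strictly positive."""
    x0 = np.hstack([eta0, v0])
    bounds = [(-np.inf, np.inf)] * len(x0)
    bounds[0] = (1e-12, np.inf)  # eta >= 1e-12, v unrestricted
    x_opt, dual_value, _ = fmin_l_bfgs_b(func=eval_dual,
                                         x0=x0,
                                         fprime=eval_dual_grad,
                                         bounds=bounds,
                                         maxiter=max_opt_itr)
    return x_opt[0], x_opt[1:], dual_value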
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self.discount)
    return np.mean(undiscounted_returns)
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.

    """
    self._replay_buffer.add_episode_batch(episodes)
    epoch = itr / self._steps_per_epoch

    for _ in range(self._n_train_steps):
        if (self._replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            qf_loss, y_s, qval, policy_loss = self._optimize_policy(itr)
            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y_s)
            self._epoch_qs.append(qval)

    if itr % self._steps_per_epoch == 0:
        logger.log('Training finished')

        if (self._replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            tabular.record('Epoch', epoch)
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self._episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self._episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self._epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self._epoch_ys)))
def train_once(self, itr, paths):
    """Perform one iteration of training.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: Average return.

    """
    paths = self.process_samples(itr, paths)
    epoch = itr / self.steps_per_epoch

    self._episode_rewards.extend([
        path for path, complete in zip(paths['undiscounted_returns'],
                                       paths['complete']) if complete
    ])
    self._success_history.extend([
        path for path, complete in zip(paths['success_history'],
                                       paths['complete']) if complete
    ])
    last_average_return = np.mean(self._episode_rewards)
    for _ in range(self.n_train_steps):
        if (self.replay_buffer.n_transitions_stored >=
                self.min_buffer_size):
            self._evaluate = True
            samples = self.replay_buffer.sample(self.buffer_batch_size)
            qf_loss, y, q, policy_loss = tu.torch_to_np(
                self.optimize_policy(itr, samples))
            self._episode_policy_losses.append(policy_loss)
            self._episode_qf_losses.append(qf_loss)
            self._epoch_ys.append(y)
            self._epoch_qs.append(q)

    if itr % self.steps_per_epoch == 0:
        logger.log('Training finished')

        if self._evaluate:
            tabular.record('Epoch', epoch)
            tabular.record('AverageReturn',
                           np.mean(self._episode_rewards))
            tabular.record('StdReturn', np.std(self._episode_rewards))
            tabular.record('Policy/AveragePolicyLoss',
                           np.mean(self._episode_policy_losses))
            tabular.record('QFunction/AverageQFunctionLoss',
                           np.mean(self._episode_qf_losses))
            tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs))
            tabular.record('QFunction/MaxQ', np.max(self._epoch_qs))
            tabular.record('QFunction/AverageAbsQ',
                           np.mean(np.abs(self._epoch_qs)))
            tabular.record('QFunction/AverageY', np.mean(self._epoch_ys))
            tabular.record('QFunction/MaxY', np.max(self._epoch_ys))
            tabular.record('QFunction/AverageAbsY',
                           np.mean(np.abs(self._epoch_ys)))
            tabular.record('AverageSuccessRate',
                           np.mean(self._success_history))

        if not self.smooth_return:
            self._episode_rewards = []
            self._episode_policy_losses = []
            self._episode_qf_losses = []
            self._epoch_ys = []
            self._epoch_qs = []

        self._success_history.clear()

    return last_average_return
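# DDPG-style training such as the step above keeps target networks that trail
# the online networks. A minimal PyTorch sketch of the soft (Polyak-averaged)
# target update commonly used for this is given below; the helper name and the
# tau default are illustrative assumptions, not this class's internals.
import torch


def soft_target_update(target_net, online_net, tau=0.005):
    """Move each target parameter a fraction tau toward its online counterpart."""
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(),
                                    online_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * o_param)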
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of trajectories from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (garage.TrajectoryBatch): Batch of trajectories. The
            trajectories should have either the "task_name" or "task_id"
            `env_infos`. If the "task_name" is not present, then `name_map`
            is required, and should map from task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be
            logged, even if there are no trajectories present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    traj_by_name = defaultdict(list)
    for trajectory in batch.split():
        task_name = '__unnamed_task__'
        if 'task_name' in trajectory.env_infos:
            task_name = trajectory.env_infos['task_name'][0]
        elif 'task_id' in trajectory.env_infos:
            name_map = {} if name_map is None else name_map
            task_id = trajectory.env_infos['task_id'][0]
            task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
        traj_by_name[task_name].append(trajectory)

    if name_map is None:
        task_names = traj_by_name.keys()
    else:
        task_names = name_map.values()

    for task_name in task_names:
        if task_name in traj_by_name:
            trajectories = traj_by_name[task_name]
            log_performance(itr,
                            garage.TrajectoryBatch.concatenate(
                                *trajectories),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumTrajs', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('TerminationRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for trajectory in batch.split():
        returns.append(discount_cumsum(trajectory.rewards, discount))
        undiscounted_returns.append(sum(trajectory.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in trajectory.step_types)))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))
        tabular.record('AverageDiscountedReturn',
                       average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
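# log_performance above relies on discount_cumsum() to compute the discounted
# return at every time step. A minimal numpy reference sketch of that
# computation (out[t] = sum_k discount**k * rewards[t + k]) is shown below;
# it is a standalone illustration, not necessarily the library's own
# implementation.
import numpy as np


def discount_cumsum(rewards, discount):
    """Return the discounted cumulative sum of rewards for every time step."""
    out = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    # Iterate backwards so each entry accumulates its discounted tail.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out


# Example: discount_cumsum([1.0, 1.0, 1.0], 0.9) -> [2.71, 1.9, 1.0]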
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []

    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.discount)
        returns.append(path['returns'])

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = (np.mean(
        [path['returns'][0] for path in paths]))

    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    samples_data = dict(average_return=np.mean(undiscounted_returns))

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data