def log_diagnostics(self, paths, *args, **kwargs):
    # Log anything maze-related here, then strip the maze obs and call
    # log_diagnostics on the stripped paths; we need to log the pure
    # gather reward!
    with logger.tabular_prefix('Maze_'):
        gather_undiscounted_returns = [
            sum(path['env_infos']['outer_rew']) for path in paths
        ]
        logger.record_tabular_misc_stat('Return',
                                        gather_undiscounted_returns,
                                        placement='front')
    stripped_paths = []
    for path in paths:
        stripped_path = {}
        for k, v in path.items():
            stripped_path[k] = v
        stripped_path['observations'] = stripped_path[
            'observations'][:, :flat_dim(self.env.observation_space)]
        # this breaks if the obs of the robot are d>1 dimensional (not a
        # vector)
        stripped_paths.append(stripped_path)
    with logger.tabular_prefix('wrapped_'):
        wrapped_undiscounted_return = np.mean(
            [np.sum(path['env_infos']['inner_rew']) for path in paths])
        logger.record_tabular('AverageReturn', wrapped_undiscounted_return)
        self.env.log_diagnostics(stripped_paths, *args, **kwargs)

def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()

    sess.run(tf.global_variables_initializer())
    self.start_worker(sess)
    start_time = time.time()
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            params = self.optimize_policy(itr)
            if self.plot:
                self.plotter.update_plot(self.policy, self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")
            logger.log("Saving snapshot...")
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('IterTime', time.time() - itr_start_time)
            logger.record_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    self.shutdown_worker()
    if created_session:
        sess.close()

def log_env_info(self, env_infos, prefix=""):
    # Logging rewards
    rew_dic = env_infos["rewards"]
    for key in rew_dic.keys():
        rew_sums = np.sum(rew_dic[key], axis=1)
        logger.record_tabular("rewards/" + key + "_avg", np.mean(rew_sums))
        logger.record_tabular("rewards/" + key + "_std", np.std(rew_sums))

def log_diagnostics(self, paths):
    self.policy.log_diagnostics(paths)
    self.baseline.log_diagnostics(paths)
    path_lengths = [path["returns"].size for path in paths]
    logger.record_tabular('ep_len_avg', np.mean(path_lengths))
    logger.record_tabular('ep_len_std', np.std(path_lengths))

def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
    # Log anything gather-related here, then strip the maze obs and call
    # log_diagnostics on the stripped paths; we need to log the pure
    # gather reward!
    with logger.tabular_prefix(log_prefix + '_'):
        gather_undiscounted_returns = [
            sum(path['env_infos']['outer_rew']) for path in paths
        ]
        logger.record_tabular_misc_stat('Return',
                                        gather_undiscounted_returns,
                                        placement='front')
    stripped_paths = []
    for path in paths:
        stripped_path = {}
        for k, v in path.items():
            stripped_path[k] = v
        stripped_path['observations'] = \
            stripped_path['observations'][
                :, :flat_dim(self.wrapped_env.observation_space)]
        # this breaks if the obs of the robot are d>1 dimensional (not a
        # vector)
        stripped_paths.append(stripped_path)
    with logger.tabular_prefix('wrapped_'):
        if 'env_infos' in paths[0].keys() \
                and 'inner_rew' in paths[0]['env_infos'].keys():
            wrapped_undiscounted_return = np.mean(
                [np.sum(path['env_infos']['inner_rew']) for path in paths])
            logger.record_tabular('AverageReturn',
                                  wrapped_undiscounted_return)
        # see swimmer_env.py for a sketch of the maze plotting!
        self.wrapped_env.log_diagnostics(stripped_paths)

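# Hedged sketch (not part of the codebase above) of what the observation
# stripping in the two log_diagnostics methods does: the robot observation is
# assumed to occupy the first flat_dim(...) columns of each row, with the
# maze/gather sensor readings appended after it, so slicing the columns
# recovers the robot-only observations. All dimensions below are illustrative.
import numpy as np

robot_dim, sensor_dim, horizon = 17, 20, 100
observations = np.random.randn(horizon, robot_dim + sensor_dim)
stripped = observations[:, :robot_dim]  # drop the appended sensor readings
assert stripped.shape == (horizon, robot_dim)
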
def optimize_policy(self, itr, samples_data):
    all_input_values = tuple(
        ext.extract(samples_data, "observations", "actions", "advantages"))
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    all_input_values += tuple(state_info_list) + tuple(dist_info_list)
    if self.policy.recurrent:
        all_input_values += (samples_data["valids"], )
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(all_input_values)
    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(all_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(all_input_values)
    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(all_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(all_input_values)
    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('MeanKLBefore', mean_kl_before)
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()

def train_once(self, itr, paths):
    itr_start_time = time.time()
    with logger.prefix('itr #%d | ' % itr):
        self.log_diagnostics(paths)
        logger.log("Optimizing policy...")
        self.optimize_policy(itr, paths)
        logger.record_tabular('IterTime', time.time() - itr_start_time)
        logger.dump_tabular()

def _train(self, env, policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training()
    self.sampler.initialize(env, policy, pool)

    evaluation_env = deep_clone(env) if self._eval_n_episodes else None

    with tf_utils.get_default_session().as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                self.sampler.sample()
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(
                        iteration=t + epoch * self._epoch_length,
                        batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(policy, evaluation_env)
            gt.stamp('eval')

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            time_itrs = gt.get_times().stamps.itrs
            time_eval = time_itrs['eval'][-1]
            time_total = gt.get_times().total
            time_train = time_itrs.get('train', [0])[-1]
            time_sample = time_itrs.get('sample', [0])[-1]

            logger.record_tabular('time-train', time_train)
            logger.record_tabular('time-eval', time_eval)
            logger.record_tabular('time-sample', time_sample)
            logger.record_tabular('time-total', time_total)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

def log_diagnostics(self, paths):
    n_goal = len(self.goal_positions)
    goal_reached = [False] * n_goal

    for path in paths:
        last_obs = path["observations"][-1]
        for i, goal in enumerate(self.goal_positions):
            if np.linalg.norm(last_obs - goal) < self.goal_threshold:
                goal_reached[i] = True

    logger.record_tabular('env:goals_reached', goal_reached.count(True))

def train(self, sess=None):
    address = ("localhost", 6000)
    conn = Client(address)
    last_average_return = None
    try:
        created_session = True if (sess is None) else False
        if sess is None:
            sess = tf.Session()
            sess.__enter__()

        sess.run(tf.global_variables_initializer())
        conn.send(ExpLifecycle.START)
        self.start_worker(sess)
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining samples...")
                conn.send(ExpLifecycle.OBTAIN_SAMPLES)
                paths = self.obtain_samples(itr)
                logger.log("Processing samples...")
                conn.send(ExpLifecycle.PROCESS_SAMPLES)
                samples_data = self.process_samples(itr, paths)
                last_average_return = samples_data["average_return"]
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                conn.send(ExpLifecycle.OPTIMIZE_POLICY)
                self.optimize_policy(itr, samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime',
                                      time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    conn.send(ExpLifecycle.UPDATE_PLOT)
                    self.plotter.update_plot(self.policy,
                                             self.max_path_length)
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        conn.send(ExpLifecycle.SHUTDOWN)
        self.shutdown_worker()
        if created_session:
            sess.close()
    finally:
        conn.close()
    return last_average_return

def train_inference_network(self, inference_opt_input_values):
    """ Optimize inference network """
    logger.log("Optimizing inference network...")
    infer_loss_before = self.inference_optimizer.loss(
        inference_opt_input_values)
    logger.record_tabular('Inference/Loss', infer_loss_before)
    self.inference_optimizer.optimize(inference_opt_input_values)
    infer_loss_after = self.inference_optimizer.loss(
        inference_opt_input_values)
    logger.record_tabular('Inference/dLoss',
                          infer_loss_before - infer_loss_after)

    return infer_loss_after

def outer_optimize(self, samples_data):
    logger.log("optimizing policy")
    observations = ext.extract(samples_data, "observations")
    actions = ext.extract(samples_data, "actions")
    advantages = ext.extract(samples_data, "advantages")
    num_traj = len(samples_data["paths"])

    observations = observations[0].reshape(
        -1, self.env.spec.observation_space.shape[0])
    actions = actions[0].reshape(-1, self.env.spec.action_space.shape[0])
    advantages = advantages[0].reshape(-1)
    inputs = tuple([observations, actions, advantages])

    s_g = self._opt_fun["f_train"](*(list(inputs)))
    # s_g = [x / num_traj for x in s_g]
    self.gradient_backup = copy.deepcopy(s_g)
    g_flat = self.flatten_parameters(s_g)

    loss_before = self._opt_fun["f_loss"](*(list(inputs)))
    self.backup_policy.set_param_values(
        self.policy.get_param_values(trainable=True), trainable=True)
    self.optimizer.optimize(inputs, g_flat)
    loss_after = self._opt_fun["f_loss"](*(list(inputs)))

    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    mean_kl, max_kl = self._opt_fun['f_kl'](*(list(inputs)))
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('MaxKL', max_kl)

def optimize(self, inputs, extra_inputs=None):
    if not inputs:
        # Assumes that we should always sample mini-batches
        raise NotImplementedError

    f_loss = self._opt_fun["f_loss"]
    f_grad = self._opt_fun["f_grad"]
    f_grad_tilde = self._opt_fun["f_grad_tilde"]

    inputs = tuple(inputs)
    if extra_inputs is None:
        extra_inputs = tuple()
    else:
        extra_inputs = tuple(extra_inputs)

    param = np.copy(self._target.get_param_values(trainable=True))
    logger.log("Start SVRPG optimization: #parameters: %d, #inputs %d" %
               (len(param), len(inputs[0])))

    dataset = BatchDataset(inputs,
                           self._batch_size,
                           extra_inputs=extra_inputs)

    start_time = time.time()

    for epoch in range(self._max_epochs):
        if self._verbose:
            logger.log("Epoch %d" % epoch)
            progbar = pyprind.ProgBar(len(inputs[0]))

        grad_sum = np.zeros_like(param)
        g_mean_tilde = f_grad_tilde(inputs, extra_inputs)
        logger.record_tabular('g_mean_tilde', LA.norm(g_mean_tilde))
        print("-------------mini-batch-------------------")

        num_batch = 0
        while num_batch < self._max_batch:
            batch = dataset.random_batch()
            # Variance-reduced gradient estimate:
            # g = grad(w) - grad(w_tilde) + full gradient at w_tilde
            g = f_grad(*(batch)) - f_grad_tilde(*(batch)) + g_mean_tilde
            grad_sum += g
            prev_w = np.copy(self._target.get_param_values(trainable=True))
            step = self._alpha * g
            cur_w = prev_w + step
            self._target.set_param_values(cur_w, trainable=True)
            num_batch += 1
        print("max batch achieved {:}".format(num_batch))

        grad_sum /= 1.0 * num_batch
        logger.record_tabular('gdist', LA.norm(grad_sum - g_mean_tilde))

        cur_w = np.copy(self._target.get_param_values(trainable=True))
        w_tilde = self._target_tilde.get_param_values(trainable=True)
        self._target_tilde.set_param_values(cur_w, trainable=True)
        logger.record_tabular('wnorm', LA.norm(cur_w))
        logger.record_tabular('w_dist',
                              LA.norm(cur_w - w_tilde) / LA.norm(cur_w))

        if self._verbose:
            if progbar.active:
                progbar.stop()

        if abs(LA.norm(cur_w - w_tilde) / LA.norm(cur_w)) < self._tolerance:
            break

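# Hedged numpy sketch of the variance-reduced (SVRG-style) gradient used in
# `optimize` above: g = grad_i(w) - grad_i(w_tilde) + full_grad(w_tilde),
# where w_tilde is a snapshot of the parameters refreshed once per epoch.
# The toy least-squares objective and all names here are illustrative only;
# the policy-gradient version above ascends instead of descending.
import numpy as np

rng = np.random.RandomState(0)
X, y = rng.randn(200, 5), rng.randn(200)

def minibatch_grad(w, idx):
    # per-minibatch least-squares gradient
    return X[idx].T @ (X[idx] @ w - y[idx]) / len(idx)

w = np.zeros(5)
w_tilde = w.copy()
alpha = 0.05
for epoch in range(10):
    full_grad_tilde = minibatch_grad(w_tilde, np.arange(len(X)))
    for _ in range(20):
        idx = rng.choice(len(X), size=10, replace=False)
        g = minibatch_grad(w, idx) - minibatch_grad(w_tilde, idx) \
            + full_grad_tilde
        w = w - alpha * g
    w_tilde = w.copy()  # refresh the snapshot, as `_target_tilde` does above
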
def log_diagnostics(self, paths):
    progs = [
        path["observations"][-1][-3] - path["observations"][0][-3]
        for path in paths
    ]
    logger.record_tabular('AverageForwardProgress', np.mean(progs))
    logger.record_tabular('MaxForwardProgress', np.max(progs))
    logger.record_tabular('MinForwardProgress', np.min(progs))
    logger.record_tabular('StdForwardProgress', np.std(progs))

def fit(self, xs, ys):
    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean_var.set_value(
            np.mean(xs, axis=0, keepdims=True).astype(theano.config.floatX))
        self._x_std_var.set_value(
            (np.std(xs, axis=0, keepdims=True) + 1e-8).astype(
                theano.config.floatX))
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        self._y_mean_var.set_value(
            np.mean(ys, axis=0, keepdims=True).astype(theano.config.floatX))
        self._y_std_var.set_value(
            (np.std(ys, axis=0, keepdims=True) + 1e-8).astype(
                theano.config.floatX))
    if self._name:
        prefix = self._name + "_"
    else:
        prefix = ""

    # FIXME: needs batch computation to avoid OOM.
    loss_before, loss_after, mean_kl, batch_count = 0., 0., 0., 0
    for batch in iterate_minibatches_generic(
            input_lst=[xs, ys], batchsize=self._batchsize, shuffle=True):
        batch_count += 1
        xs, ys = batch
        if self._use_trust_region:
            old_means, old_log_stds = self._f_pdists(xs)
            inputs = [xs, ys, old_means, old_log_stds]
        else:
            inputs = [xs, ys]
        loss_before += self._optimizer.loss(inputs)

        self._optimizer.optimize(inputs)

        loss_after += self._optimizer.loss(inputs)
        if self._use_trust_region:
            mean_kl += self._optimizer.constraint_val(inputs)

    logger.record_tabular(prefix + 'LossBefore', loss_before / batch_count)
    logger.record_tabular(prefix + 'LossAfter', loss_after / batch_count)
    logger.record_tabular(prefix + 'dLoss',
                          (loss_before - loss_after) / batch_count)
    if self._use_trust_region:
        logger.record_tabular(prefix + 'MeanKL', mean_kl / batch_count)

def _training_step(self, itr):
    itr_start_time = time.time()
    with logger.prefix('itr #%d | ' % itr):
        self._sampling()
        self._bookkeeping()
        self._memory_selection(itr)
        self._policy_optimization(itr)
        if itr % self.evaluation_interval == 0:
            self._policy_evaluation()
        self._log_diagnostics(itr)
        logger.record_tabular('Time', time.time() - self.start_time)
        logger.record_tabular('ItrTime', time.time() - itr_start_time)
        logger.dump_tabular(with_prefix=False)

def train(self, sess=None):
    created_session = True if (sess is None) else False
    if sess is None:
        sess = tf.Session()
        sess.__enter__()

    sess.run(tf.global_variables_initializer())
    self.start_worker(sess)
    start_time = time.time()
    last_average_return = None
    for itr in range(self.start_itr, self.n_itr):
        itr_start_time = time.time()
        with logger.prefix('itr #%d | ' % itr):
            logger.log("Obtaining samples...")
            paths = self.obtain_samples(itr)
            logger.log("Processing samples...")
            samples_data = self.process_samples(itr, paths)
            last_average_return = samples_data["average_return"]
            logger.log("Logging diagnostics...")
            self.log_diagnostics(paths)
            logger.log("Optimizing policy...")
            self.optimize_policy(itr, samples_data)
            logger.log("Saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("Saved")
            logger.record_tabular('Time', time.time() - start_time)
            logger.record_tabular('ItrTime', time.time() - itr_start_time)
            logger.dump_tabular(with_prefix=False)
            if self.plot:
                self.plotter.update_plot(self.policy, self.max_path_length)
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to "
                          "continue...")

    self.shutdown_worker()
    if created_session:
        sess.close()
    return last_average_return

def _fit_baseline(self, samples_data):
    """ Update baselines from samples. """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self.f_rewards(*policy_opt_input_values)
    returns_tensor = self.f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor)

    paths = samples_data["paths"]
    valids = samples_data["valids"]
    baselines = [path["baselines"] for path in paths]

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path["rewards"] = rew[val.astype(np.bool)]
        path["returns"] = ret[val.astype(np.bool)]
        aug_rewards.append(path["rewards"])
        aug_returns.append(path["returns"])
    aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
    aug_returns = tensor_utils.concat_tensor_list(aug_returns)
    samples_data["rewards"] = aug_rewards
    samples_data["returns"] = aug_returns

    # Calculate explained variance
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       aug_returns)
    logger.record_tabular(
        "{}/ExplainedVariance".format(self.baseline.name), ev)

    # Fit baseline
    logger.log("Fitting baseline...")
    if hasattr(self.baseline, "fit_with_samples"):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)

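# Hedged sketch of the explained-variance diagnostic logged above. The exact
# implementation lives in special.explained_variance_1d; the quantity is
# 1 - Var(returns - baselines) / Var(returns), so 1.0 means the baseline
# predicts the returns perfectly and 0.0 means it explains nothing.
import numpy as np

def explained_variance_1d_sketch(ypred, y):
    vary = np.var(y)
    if np.isclose(vary, 0):
        return 0.0  # degenerate case; the library handles it separately
    return 1.0 - np.var(y - ypred) / vary
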
def train(self):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        self.start_worker(sess)
        start_time = time.time()
        self.num_samples = 0
        for itr in range(self.start_itr, self.n_itr):
            itr_start_time = time.time()
            with logger.prefix('itr #%d | ' % itr):
                logger.log("Obtaining new samples...")
                paths = self.obtain_samples(itr)
                for path in paths:
                    self.num_samples += len(path["rewards"])
                logger.log("total num samples..." + str(self.num_samples))
                logger.log("Processing samples...")
                samples_data = self.process_samples(itr, paths)
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                logger.log("Optimizing policy...")
                self.outer_optimize(samples_data)
                for sub_itr in range(self.n_sub_itr):
                    logger.log("Minibatch Optimizing...")
                    self.inner_optimize(samples_data)
                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime',
                                      time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                # if self.plot:
                #     self.update_plot()
                #     if self.pause_for_plot:
                #         input("Plotting evaluation run: Press Enter to "
                #               "continue...")
        self.shutdown_worker()

def fit(self, xs, ys):
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        tf.get_default_session().run(
            tf.group(
                tf.assign(self.x_mean_var, new_mean),
                tf.assign(self.x_std_var, new_std),
            ))
        # self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
        # self._x_std_var.set_value(
        #     np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self.use_trust_region and self.first_optimized:
        old_p = self.f_p(xs)
        inputs = [xs, ys, old_p]
        optimizer = self.tr_optimizer
    else:
        inputs = [xs, ys]
        optimizer = self.optimizer
    loss_before = optimizer.loss(inputs)
    if self.name:
        prefix = self.name + "_"
    else:
        prefix = ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    optimizer.optimize(inputs)
    loss_after = optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
    self.first_optimized = True

def fit(self, xs, ys): """Optimize the regressor based on the inputs.""" if self._subsample_factor < 1: num_samples_tot = xs.shape[0] idx = np.random.randint( 0, num_samples_tot, int(num_samples_tot * self._subsample_factor)) xs, ys = xs[idx], ys[idx] sess = tf.get_default_session() if self._normalize_inputs: # recompute normalizing constants for inputs sess.run([ tf.assign(self._x_mean_var, np.mean(xs, axis=0, keepdims=True)), tf.assign(self._x_std_var, np.std(xs, axis=0, keepdims=True) + 1e-8), ]) if self._normalize_outputs: # recompute normalizing constants for outputs sess.run([ tf.assign(self._y_mean_var, np.mean(ys, axis=0, keepdims=True)), tf.assign(self._y_std_var, np.std(ys, axis=0, keepdims=True) + 1e-8), ]) if self._use_trust_region: old_means, old_log_stds = self._f_pdists(xs) inputs = [xs, ys, old_means, old_log_stds] else: inputs = [xs, ys] loss_before = self._optimizer.loss(inputs) if self._name: prefix = self._name + "/" else: prefix = "" logger.record_tabular(prefix + 'LossBefore', loss_before) self._optimizer.optimize(inputs) loss_after = self._optimizer.loss(inputs) logger.record_tabular(prefix + 'LossAfter', loss_after) if self._use_trust_region: logger.record_tabular(prefix + 'MeanKL', self._optimizer.constraint_val(inputs)) logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
def optimize_policy(self, itr, samples_data):
    logger.log("optimizing policy")
    inputs = ext.extract(samples_data, "observations", "actions",
                         "advantages")
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    inputs += tuple(state_info_list)
    if self.policy.recurrent:
        inputs += (samples_data["valids"], )
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    loss_before = self.optimizer.loss(inputs)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) +
                                              dist_info_list))
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('MaxKL', max_kl)

def fit(self, xs, ys):
    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean_var.set_value(np.mean(xs, axis=0, keepdims=True))
        self._x_std_var.set_value(np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self._use_trust_region:
        old_prob = self._f_prob(xs)
        inputs = [xs, ys, old_prob]
    else:
        inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    if self._name:
        prefix = self._name + "_"
    else:
        prefix = ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)

def log_diagnostics(self, batch):
    """Record diagnostic information.

    Records the mean and standard deviation of the Q-function and the
    squared Bellman residual (mean squared Bellman error) for a sample
    batch.

    Also calls the `draw` method of the plotter, if a plotter is defined.
    """
    feeds = self._get_feed_dict(batch)
    qf, bellman_residual = self._sess.run(
        [self._q_values, self._bellman_residual], feeds)

    logger.record_tabular('qf-avg', np.mean(qf))
    logger.record_tabular('qf-std', np.std(qf))
    logger.record_tabular('mean-sq-bellman-error', bellman_residual)

    self.policy.log_diagnostics(batch)
    if self.plotter:
        self.plotter.draw()

def fit(self, xs, ys):
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        tf.get_default_session().run(
            tf.group(
                tf.assign(self.x_mean_var, new_mean),
                tf.assign(self.x_std_var, new_std),
            ))
    inputs = [xs, ys]
    loss_before = self.optimizer.loss(inputs)
    if self.name:
        prefix = self.name + "/"
    else:
        prefix = ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)

def log_diagnostics(self, paths):
    log_stds = np.vstack(
        [path["agent_infos"]["log_std"] for path in paths])
    logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

def evaluate(self, epoch, pool):
    logger.log("Collecting samples for evaluation")
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_samples,
        max_path_length=self.max_path_length,
    )

    average_discounted_return = np.mean([
        special.discount_return(path["rewards"], self.discount)
        for path in paths
    ])

    returns = [sum(path["rewards"]) for path in paths]

    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_q_loss = np.mean(self.qf_loss_averages)
    average_policy_surr = np.mean(self.policy_surr_averages)
    average_action = np.mean(
        np.square(np.concatenate([path["actions"] for path in paths])))

    policy_reg_param_norm = np.linalg.norm(
        self.policy.get_param_values(regularizable=True))
    qfun_reg_param_norm = np.linalg.norm(
        self.qf.get_param_values(regularizable=True))

    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    if self.es_path_returns:
        logger.record_tabular('AverageEsReturn',
                              np.mean(self.es_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageQLoss', average_q_loss)
    logger.record_tabular('AveragePolicySurr', average_policy_surr)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff',
                          np.mean(np.abs(all_qs - all_ys)))
    logger.record_tabular('AverageAction', average_action)
    logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
    logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

    self.policy.log_diagnostics(paths)

    self.qf_loss_averages = []
    self.policy_surr_averages = []
    self.q_averages = []
    self.y_averages = []
    self.es_path_returns = []

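# Hedged sketch of the discounted return computed by special.discount_return
# in `evaluate` above: sum_t discount**t * r_t over a single path. Only the
# formula is asserted here, not the library's exact implementation.
import numpy as np

def discount_return_sketch(rewards, discount):
    return float(np.sum(rewards * discount ** np.arange(len(rewards))))

# e.g. discount_return_sketch(np.array([1., 1., 1.]), 0.99) -> about 2.9701
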
def optimize_policy(self, itr, samples_data):
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(policy_opt_input_values)
    logger.log("Computing KL before")
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(policy_opt_input_values)
    logger.log("Computing KL after")
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(policy_opt_input_values)
    logger.record_tabular("{}/LossBefore".format(self.policy.name),
                          loss_before)
    logger.record_tabular("{}/LossAfter".format(self.policy.name),
                          loss_after)
    logger.record_tabular("{}/dLoss".format(self.policy.name),
                          loss_before - loss_after)
    logger.record_tabular("{}/KLBefore".format(self.policy.name),
                          policy_kl_before)
    logger.record_tabular("{}/KL".format(self.policy.name), policy_kl)

    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    logger.record_tabular("{}/Entropy".format(self.policy.name),
                          np.mean(pol_ent))

    num_traj = self.batch_size // self.max_path_length
    actions = samples_data["actions"][:num_traj, ...]
    logger.record_histogram("{}/Actions".format(self.policy.name), actions)

    self._fit_baseline(samples_data)

def log_diagnostics(self, paths):
    arm_dists = [p['env_infos'][-1]['arm_distance'] for p in paths]
    goal_dists = [p['env_infos'][-1]['goal_distance'] for p in paths]

    logger.record_tabular('FinalArmDistanceAvg', np.mean(arm_dists))
    logger.record_tabular('FinalArmDistanceMax', np.max(arm_dists))
    logger.record_tabular('FinalArmDistanceMin', np.min(arm_dists))
    logger.record_tabular('FinalArmDistanceStd', np.std(arm_dists))

    logger.record_tabular('FinalGoalDistanceAvg', np.mean(goal_dists))
    logger.record_tabular('FinalGoalDistanceMax', np.max(goal_dists))
    logger.record_tabular('FinalGoalDistanceMin', np.min(goal_dists))
    logger.record_tabular('FinalGoalDistanceStd', np.std(goal_dists))

def log_diagnostics(self):
    logger.record_tabular('pool-size', self.pool.size)

def evaluate(self, policy_opt_input_values, samples_data):
    # Everything else
    rewards_tensor = self.f_rewards(*policy_opt_input_values)
    returns_tensor = self.f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor)
    # TODO: check the squeeze/dimension handling for both convolutions

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]
    env_rewards = [path['rewards'] for path in paths]
    env_rewards = tensor_utils.concat_tensor_list(env_rewards.copy())
    env_returns = [path['returns'] for path in paths]
    env_returns = tensor_utils.concat_tensor_list(env_returns.copy())
    env_average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(np.bool)]
        path['returns'] = ret[val.astype(np.bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = tensor_utils.concat_tensor_list(aug_rewards)
    aug_returns = tensor_utils.concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate effect of the entropy terms
    d_rewards = np.mean(aug_rewards - env_rewards)
    logger.record_tabular('Policy/EntRewards', d_rewards)
    aug_average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])
    d_returns = np.mean(aug_average_discounted_return -
                        env_average_discounted_return)
    logger.record_tabular('Policy/EntReturns', d_returns)

    # Calculate explained variance
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       aug_returns)
    logger.record_tabular('Baseline/ExplainedVariance', ev)

    inference_rmse = (samples_data['trajectory_infos']['mean'] -
                      samples_data['latents'])**2.
    inference_rmse = np.sqrt(inference_rmse.mean())
    logger.record_tabular('Inference/RMSE', inference_rmse)

    inference_rrse = rrse(samples_data['latents'],
                          samples_data['trajectory_infos']['mean'])
    logger.record_tabular('Inference/RRSE', inference_rrse)

    embed_ent = self.f_embedding_entropy(*policy_opt_input_values)
    logger.record_tabular('Embedding/Entropy', embed_ent)

    infer_ce = self.f_inference_ce(*policy_opt_input_values)
    logger.record_tabular('Inference/CrossEntropy', infer_ce)

    pol_ent = self.f_policy_entropy(*policy_opt_input_values)
    logger.record_tabular('Policy/Entropy', pol_ent)

    # task_ents = self.f_task_entropies(*policy_opt_input_values)
    # tasks = samples_data["tasks"][:, 0, :]
    # _, task_indices = np.nonzero(tasks)
    # path_lengths = np.sum(samples_data["valids"], axis=1)
    # for t in range(self.policy.n_tasks):
    #     lengths = path_lengths[task_indices == t]
    #     completed = lengths < self.max_path_length
    #     pct_completed = np.mean(completed)
    #     num_samples = np.sum(lengths)
    #     num_trajs = lengths.shape[0]
    #     logger.record_tabular('Tasks/EpisodeLength/t={}'.format(t),
    #                           np.mean(lengths))
    #     logger.record_tabular('Tasks/CompletionRate/t={}'.format(t),
    #                           pct_completed)
    #     logger.record_tabular('Tasks/NumSamples/t={}'.format(t),
    #                           num_samples)
    #     logger.record_tabular('Tasks/NumTrajs/t={}'.format(t), num_trajs)
    #     logger.record_tabular('Tasks/Entropy/t={}'.format(t), task_ents[t])

    return samples_data

def train_policy_and_embedding_networks(self, policy_opt_input_values):
    """ Joint optimization of policy and embedding networks """
    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(policy_opt_input_values)

    logger.log("Computing KL before")
    policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
    embed_kl_before = self.f_embedding_kl(*policy_opt_input_values)

    logger.log("Optimizing")
    self.optimizer.optimize(policy_opt_input_values)

    logger.log("Computing KL after")
    policy_kl = self.f_policy_kl(*policy_opt_input_values)
    embed_kl = self.f_embedding_kl(*policy_opt_input_values)

    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(policy_opt_input_values)

    logger.record_tabular('Policy/LossBefore', loss_before)
    logger.record_tabular('Policy/LossAfter', loss_after)
    logger.record_tabular('Policy/KLBefore', policy_kl_before)
    logger.record_tabular('Policy/KL', policy_kl)
    logger.record_tabular('Policy/dLoss', loss_before - loss_after)
    logger.record_tabular('Embedding/KLBefore', embed_kl_before)
    logger.record_tabular('Embedding/KL', embed_kl)

    return loss_after

def log_diagnostics(self):
    super(SimpleSampler, self).log_diagnostics()
    logger.record_tabular('max-path-return', self._max_path_return)
    logger.record_tabular('last-path-return', self._last_path_return)
    logger.record_tabular('episodes', self._n_episodes)
    logger.record_tabular('total-samples', self._total_samples)

def _evaluate(self, policy, evaluation_env):
    """Perform evaluation for the current policy."""
    if self._eval_n_episodes < 1:
        return

    # TODO: max_path_length should be a property of environment.
    paths = rollouts(evaluation_env, policy,
                     self.sampler._max_path_length, self._eval_n_episodes)

    total_returns = [path['rewards'].sum() for path in paths]
    episode_lengths = [len(p['rewards']) for p in paths]

    logger.record_tabular('return-average', np.mean(total_returns))
    logger.record_tabular('return-min', np.min(total_returns))
    logger.record_tabular('return-max', np.max(total_returns))
    logger.record_tabular('return-std', np.std(total_returns))
    logger.record_tabular('episode-length-avg', np.mean(episode_lengths))
    logger.record_tabular('episode-length-min', np.min(episode_lengths))
    logger.record_tabular('episode-length-max', np.max(episode_lengths))
    logger.record_tabular('episode-length-std', np.std(episode_lengths))

    evaluation_env.log_diagnostics(paths)
    if self._eval_render:
        evaluation_env.render(paths)

    if self.sampler.batch_ready():
        batch = self.sampler.random_batch()
        self.log_diagnostics(batch)

def log_diagnostics(self):
    logger.record_tabular('max-path-return', self._max_path_return)
    logger.record_tabular('last-path-return', self._last_path_return)
    logger.record_tabular('pool-size', self.pool.size)
    logger.record_tabular('episodes', self._n_episodes)
    logger.record_tabular('total-samples', self._total_samples)

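# Hedged toy stand-in (not the rllab/garage logger) illustrating the tabular
# logging contract assumed throughout these snippets: record_tabular collects
# key/value pairs for the current iteration and dump_tabular flushes them as
# one row, after which the table starts empty again.
_tabular_row = {}

def record_tabular(key, value):
    _tabular_row[key] = value

def dump_tabular():
    global _tabular_row
    print(" | ".join("{}: {}".format(k, v)
                     for k, v in sorted(_tabular_row.items())))
    _tabular_row = {}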