def do_training(self, itr, batch):
    obs, actions, rewards, next_obs, terminals = ext.extract(
        batch, "observations", "actions", "rewards", "next_observations",
        "terminals")

    # compute the on-policy y values (TD targets)
    target_qf = self.opt_info["target_qf"]
    target_policy = self.opt_info["target_policy"]

    next_actions, _ = target_policy.get_actions(next_obs)
    next_qvals = target_qf.get_qval(next_obs, next_actions)

    ys = rewards + (1. - terminals) * self.discount * next_qvals

    f_train_qf = self.opt_info["f_train_qf"]
    f_train_policy = self.opt_info["f_train_policy"]

    qf_loss, qval = f_train_qf(ys, obs, actions)
    policy_surr = f_train_policy(obs)

    # soft (Polyak) update of the target networks
    target_policy.set_param_values(
        target_policy.get_param_values() * (1.0 - self.soft_target_tau) +
        self.policy.get_param_values() * self.soft_target_tau)
    target_qf.set_param_values(
        target_qf.get_param_values() * (1.0 - self.soft_target_tau) +
        self.qf.get_param_values() * self.soft_target_tau)

    self.qf_loss_averages.append(qf_loss)
    self.policy_surr_averages.append(policy_surr)
    self.q_averages.append(qval)
    self.y_averages.append(ys)
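# A minimal numpy sketch of the two updates above, independent of the rllab
# API: the TD target y = r + (1 - terminal) * gamma * Q'(s', a') and the soft
# (Polyak) target update theta_target <- (1 - tau) * theta_target + tau * theta.
# All names below are illustrative, not from the codebase.
import numpy as np

def td_targets(rewards, terminals, next_qvals, discount=0.99):
    # terminal transitions get no bootstrapped value
    return rewards + (1. - terminals) * discount * next_qvals

def soft_update(target_params, source_params, tau=0.001):
    # move the target network a small step toward the learned network
    return target_params * (1.0 - tau) + source_params * tau

rng = np.random.RandomState(0)
ys = td_targets(rng.rand(4), np.array([0., 0., 1., 0.]), rng.rand(4))
new_target = soft_update(rng.rand(8), rng.rand(8), tau=0.005)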
def optimize_policy(self, itr, all_samples_data):
    logger.log("optimizing policy")
    assert len(all_samples_data) == self.num_grad_updates + 1
    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i], "observations",
                                 "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        input_list += obs_list + action_list + adv_list

        if step == 0:
            init_inputs = input_list

    loss_before = self.optimizer.loss(input_list)
    logger.log("Optimizing")
    self.optimizer.optimize(input_list)
    loss_after = self.optimizer.loss(input_list)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    dist_info_list = []
    for i in range(self.meta_batch_size):
        agent_infos = all_samples_data[-1][i]['agent_infos']
        dist_info_list += [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
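# input_list above groups inputs by kind within each gradient step (all task
# observations, then all actions, then all advantages) and concatenates the
# per-step blocks; the order must match the symbolic inputs the optimizer was
# built with. Tiny sketch with 2 tasks and 2 gradient steps (illustrative
# names, not the codebase's data layout helpers):
all_samples = [[{"obs": "o%d%d" % (s, i), "act": "a%d%d" % (s, i),
                 "adv": "d%d%d" % (s, i)} for i in range(2)]
               for s in range(2)]
input_list = []
for step_data in all_samples:
    for kind in ("obs", "act", "adv"):
        input_list += [task[kind] for task in step_data]
# input_list == ['o00', 'o01', 'a00', 'a01', 'd00', 'd01', 'o10', 'o11', ...]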
def optimize_policy(self, itr, samples_data):
    all_input_values = tuple(
        ext.extract(samples_data, "observations", "actions", "advantages"))
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]
    all_input_values += tuple(state_info_list) + tuple(dist_info_list)
    if self.policy.recurrent:
        all_input_values += (samples_data["valids"], )

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(all_input_values)
    logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(all_input_values)
    logger.log("Optimizing")
    self.optimizer.optimize(all_input_values)
    logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(all_input_values)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(all_input_values)

    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('MeanKLBefore', mean_kl_before)
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once per iteration and pull them out of
    tensorflow for sampling with the post-update policy.
    """
    sess = tf.get_default_session()
    num_tasks = len(samples)
    assert num_tasks == self.num_tasks

    # flatten the per-task inputs, expanding agent_infos into its
    # distribution keys
    input_list = list([] for _ in range(len(self.update_input_keys)))
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], *self.update_input_keys)
        for j, input_name in enumerate(self.update_input_keys):
            if input_name == 'agent_infos':
                input_list[j].extend([
                    inputs[j][k] for k in self.distribution.dist_info_keys
                ])
            else:
                input_list[j].append(inputs[j])
    inputs = sum(input_list, [])

    # feed both the sampled data and the current per-task parameter values
    feed_dict_inputs = list(zip(self.input_list_for_grad, inputs))
    feed_dict_params = list(
        (self.all_params_ph[i][key], self.all_param_vals[i][key])
        for i in range(num_tasks)
        for key in self.all_params_ph[0].keys())
    feed_dict = dict(feed_dict_inputs + feed_dict_params)

    self.all_param_vals, gradients = sess.run(
        [self.all_fast_params_tensor, self._all_param_gradients],
        feed_dict=feed_dict)
def __setstate__(self, d):
    super(ReplayPool, self).__setstate__(d)
    self.bottom, self.top, self.size, self.observations, self.actions, \
        self.rewards, self.terminals, self.extras, self.rng = extract(
            d,
            "bottom", "top", "size", "observations", "actions", "rewards",
            "terminals", "extras", "rng")
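# `extract` (used throughout these methods) pulls several keys out of a dict
# in one call and returns them as a tuple, so call sites can unpack named
# fields on a single line. A minimal sketch of the behavior relied on above;
# the real rllab.misc.ext.extract also handles lists of dicts and other
# container types:
def extract(x, *keys):
    return tuple(x[k] for k in keys)

batch = {"observations": [1], "actions": [0], "rewards": [1.0]}
obs, actions, rewards = extract(batch, "observations", "actions", "rewards")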
def optimize_policy(self, itr, all_samples_data, log=True):
    # we collected rollouts for the gradient steps and then the final test batch
    assert len(all_samples_data) == self.num_grad_updates + 1

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i], "observations",
                                 "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]
        input_list += obs_list + action_list + adv_list

        if step == 0:  # CF: not used?
            init_inputs = input_list

    if self.use_maml:
        dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
            dist_info_list += [
                agent_infos[k]
                for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(dist_info_list)
        if log:
            logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(input_list)

    if log:
        logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list)
    if log:
        logger.log("Optimizing")
    self.optimizer.optimize(input_list)
    if log:
        logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list)

    if self.use_maml:
        if log:
            logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(input_list)
        if log:
            logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
            logger.record_tabular('MeanKL', mean_kl)

    if log:
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def optimize_policy(self, itr, all_samples_data, log=True):
    # we collected rollouts for the gradient steps and then the final test batch
    assert len(all_samples_data) == self.num_grad_updates + 1

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list, dist_info_list = [], [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i], "observations",
                                 "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
            agent_infos = all_samples_data[step][i]['agent_infos']
            dist_info_list.extend([
                agent_infos[k]
                for k in self.policy.distribution.dist_info_keys
            ])
        # [ [obs_0], [act_0], [adv_0], [dist_0], [obs_1], ... ]
        input_list += obs_list + action_list + adv_list + dist_info_list

    kl_coeff = tuple(self.kl_coeff)

    if log:
        logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list, extra_inputs=kl_coeff)
    if log:
        logger.log("Optimizing")
    self.optimizer.optimize(input_list, extra_inputs=kl_coeff)
    if log:
        logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list, extra_inputs=kl_coeff)

    if log:
        logger.log("Updating KL loss coefficients")
    kls = self.optimizer.inner_kl(input_list, extra_inputs=kl_coeff)
    for i, kl in enumerate(kls):
        if kl < self.target_inner_step / 1.5:
            self.kl_coeff[i] /= 2
        if kl > self.target_inner_step * 1.5:
            self.kl_coeff[i] *= 2

    if self.use_maml and log:
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        logger.record_tabular('klDiff', np.mean(kls))
    return dict()
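# The coefficient update above is the standard adaptive KL-penalty rule (as in
# PPO's penalty variant): halve the coefficient when the measured KL is well
# below the target, double it when it is well above, and leave it alone in
# between. Standalone sketch with illustrative names and numbers:
def adapt_kl_coeffs(kl_coeffs, kls, target, tol=1.5):
    for i, kl in enumerate(kls):
        if kl < target / tol:
            kl_coeffs[i] /= 2    # constraint is slack: penalize less
        elif kl > target * tol:
            kl_coeffs[i] *= 2    # constraint is violated: penalize more
    return kl_coeffs

coeffs = adapt_kl_coeffs([1.0, 1.0, 1.0], kls=[0.001, 0.01, 0.1], target=0.01)
# coeffs == [0.5, 1.0, 2.0]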
def optimize_policy(self, itr, samples_data):
    logger.log("optimizing policy")
    inputs = ext.extract(samples_data, "observations", "actions",
                         "advantages")
    agent_infos = samples_data["agent_infos"]
    state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
    inputs += tuple(state_info_list)
    if self.policy.recurrent:
        inputs += (samples_data["valids"], )
    dist_info_list = [
        agent_infos[k] for k in self.policy.distribution.dist_info_keys
    ]

    loss_before = self.optimizer.loss(inputs)
    self.optimizer.optimize(inputs)
    loss_after = self.optimizer.loss(inputs)
    logger.record_tabular("LossBefore", loss_before)
    logger.record_tabular("LossAfter", loss_after)

    mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
    logger.record_tabular('MeanKL', mean_kl)
    logger.record_tabular('MaxKL', max_kl)
def optimize_policy(self, itr, all_samples_data, log=True):
    # we collected rollouts for the gradient steps and then the final test batch
    assert len(all_samples_data) == self.num_grad_updates + 1

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list, dist_info_list = [], [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i],
                                 *self._optimization_keys)
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
            dist_info_list.extend([
                inputs[3][k]
                for k in self.policy.distribution.dist_info_keys
            ])
        # [ [obs_0], [act_0], [adv_0], [dist_0], [obs_1], ... ]
        input_list += obs_list + action_list + adv_list + dist_info_list

    kl_coeff = tuple(self.kl_coeff)
    if not self.clip_outer:
        kl_coeff += tuple(self.outer_kl_coeff)

    if log:
        logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list, extra_inputs=kl_coeff)
    if log:
        logger.log("Optimizing")
    self.optimizer.optimize(input_list, extra_inputs=kl_coeff)
    if log:
        logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list, extra_inputs=kl_coeff)

    inner_kls = self.optimizer.inner_kl(input_list, extra_inputs=kl_coeff)
    if self.adaptive_inner_kl_penalty:
        if log:
            logger.log("Updating inner KL loss coefficients")
        for i, kl in enumerate(inner_kls):
            if kl < self.target_inner_step / 1.5:
                self.kl_coeff[i] /= 2
            if kl > self.target_inner_step * 1.5:
                self.kl_coeff[i] *= 2

    outer_kls = self.optimizer.outer_kl(input_list, extra_inputs=kl_coeff)
    if self.adaptive_outer_kl_penalty:
        if log:
            logger.log("Updating outer KL loss coefficients")
        for i, kl in enumerate(outer_kls):
            if kl < self.target_outer_step / 1.5:
                self.outer_kl_coeff[i] /= 2
            if kl > self.target_outer_step * 1.5:
                self.outer_kl_coeff[i] *= 2

    if self.use_maml and log:
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        logger.record_tabular('klDiff', np.mean(inner_kls))
        if not self.clip_outer:
            logger.record_tabular('outerklDiff', np.mean(outer_kls))
    return dict()
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once per iteration and pull them out of
    tensorflow for sampling with the post-update policy.
    """
    start = time.time()
    num_tasks = len(samples)
    param_keys = self.all_params.keys()
    update_param_keys = param_keys
    no_update_param_keys = []

    sess = tf.get_default_session()

    obs_list, action_list, adv_list, distr_list = [], [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions',
                             'advantages', 'agent_infos')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])
        distr_list.extend(inputs[3][k]
                          for k in self.distribution.dist_info_keys)
    inputs = obs_list + action_list + adv_list + distr_list

    # To do a second update, replace self.all_params below with the params
    # that were used to collect the policy.
    if self.first_inner_step:
        # only on the first inner step: snapshot the pre-update params
        self.init_param_values = self.get_variable_values(self.all_params)
        self.all_param_vals = [
            self.get_variable_values(self.all_params)
            for _ in range(num_tasks)
        ]

    if self.params_ph is None:
        self.params_ph = [
            OrderedDict([(key, tf.placeholder(tf.float32, shape=value.shape))
                         for key, value in self.all_params.items()])
            for _ in range(num_tasks)
        ]

    if 'all_fast_params_tensor' not in dir(self):  # only enter on the first iteration
        # make the computation graph once
        self.all_fast_params_tensor = []
        for i in range(num_tasks):
            # compute gradients for the current task (symbolic)
            for key in self.all_params.keys():
                tf.assign(self.all_params[key], self.params_ph[i][key])
            gradients = dict(
                zip(update_param_keys,
                    tf.gradients(self.surr_objs[i], [
                        self.all_params[key] for key in update_param_keys
                    ])))

            # gradient update for the params of the current task (symbolic)
            fast_params_tensor = OrderedDict(
                zip(update_param_keys, [
                    self.all_params[key] -
                    tf.multiply(self.param_step_sizes[key + "_step_size"],
                                gradients[key])
                    for key in update_param_keys
                ]))

            # add step sizes to fast_params_tensor
            fast_params_tensor.update(self.param_step_sizes)

            # undo the gradient update for no_update_params (symbolic)
            for k in no_update_param_keys:
                fast_params_tensor[k] = self.all_params[k]

            # tensors that represent the updated params for all tasks (symbolic)
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull the new param values out of tensorflow, so the gradient computation
    # is only done once; the feed carries both the sampled data and the
    # current per-task parameter values
    feed_dict_inputs = list(zip(self.input_list_for_grad, inputs))
    feed_dict_params = list(
        (self.params_ph[task][key], self.all_param_vals[task][key])
        for task in range(num_tasks)
        for key in self.params_ph[0].keys())
    feed_dict = dict(feed_dict_inputs + feed_dict_params)
    self.all_param_vals = sess.run(self.all_fast_params_tensor,
                                   feed_dict=feed_dict)

    if self.all_param_ph is None:
        self.all_param_ph = [
            OrderedDict([(key, tf.placeholder(tf.float32, shape=value.shape))
                         for key, value in self.all_param_vals[0].items()])
            for _ in range(num_tasks)
        ]

    # reset parameters to the original ones
    self.assign_params(self.all_params, self.init_param_values)

    # compile _cur_f_dist once; updated params are fed through placeholders,
    # so there is no recompilation on later calls
    if not self.compiled:
        outputs = []
        with tf.variable_scope("post_updated_policy"):
            inputs = tf.split(self.input_tensor, num_tasks, 0)
            for i in range(num_tasks):
                task_inp = inputs[i]
                info, _ = self.dist_info_sym(task_inp, dict(),
                                             all_params=self.all_param_ph[i],
                                             is_training=False)
                outputs.append([info['mean'], info['log_std']])

        self.__cur_f_dist = tensor_utils.compile_function(
            inputs=[self.input_tensor, self.param_noise_std_ph] + sum(
                [list(param_ph.values()) for param_ph in self.all_param_ph],
                []),
            outputs=outputs,
        )
        self.compiled = True
    self._cur_f_dist = self.__cur_f_dist

    self.first_inner_step = False
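# The version above resolves the TODO that the sibling implementations still
# carry: the post-update graph is built once against parameter *placeholders*
# and the per-task fast weights are fed in as data at call time, so nothing is
# recompiled each iteration. A minimal TF1 sketch of the idea (all names
# illustrative, not from the codebase):
import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 3))
w_ph = tf.placeholder(tf.float32, shape=(3, 2))   # parameters enter as data
y = tf.matmul(x, w_ph)                            # graph is built exactly once

with tf.Session() as sess:
    for fast_w in [np.zeros((3, 2)), np.ones((3, 2))]:  # "updated" params
        out = sess.run(y, feed_dict={x: np.ones((1, 3)), w_ph: fast_w})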
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once and pull them out of tensorflow for sampling.
    """
    num_tasks = len(samples)
    param_keys = self.all_params.keys()

    sess = tf.get_default_session()

    obs_list, action_list, adv_list = [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions',
                             'advantages')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])
    inputs = obs_list + action_list + adv_list

    # To do a second update, replace self.all_params below with the params
    # that were used to collect the policy.
    init_param_values = None
    if self.all_param_vals is not None:
        init_param_values = self.get_variable_values(self.all_params)

    step_size = self.step_size
    for i in range(num_tasks):
        if self.all_param_vals is not None:
            self.assign_params(self.all_params, self.all_param_vals[i])

    if 'all_fast_params_tensor' not in dir(self):
        # make the computation graph once
        self.all_fast_params_tensor = []
        for i in range(num_tasks):
            gradients = dict(
                zip(param_keys,
                    tf.gradients(self.surr_objs[i], [
                        self.all_params[key] for key in param_keys
                    ])))
            fast_params_tensor = dict(
                zip(param_keys, [
                    self.all_params[key] - step_size * gradients[key]
                    for key in param_keys
                ]))
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull the new param values out of tensorflow, so the gradient
    # computation is only done once
    self.all_param_vals = sess.run(
        self.all_fast_params_tensor,
        feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

    if init_param_values is not None:
        self.assign_params(self.all_params, init_param_values)

    outputs = []
    inputs = tf.split(self._l_obs, num_tasks, 0)
    for i in range(num_tasks):
        # TODO - use a placeholder to feed in the params, so that we don't
        # have to recompile every time.
        task_inp = inputs[i]
        info, _ = self.dist_info_sym(task_inp, dict(),
                                     all_params=self.all_param_vals[i],
                                     is_training=False)
        outputs.append([info['prob']])

    self._cur_f_prob = tensor_utils.compile_function(
        inputs=[self._l_obs],
        outputs=outputs,
    )
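# All compute_updated_dists variants implement the MAML inner step: for each
# task i, theta_i' = theta - alpha * grad L_i(theta), evaluated once per
# iteration and cached as plain arrays so sampling with the post-update policy
# needs no further gradient computation. Framework-free sketch on a quadratic
# per-task loss (names illustrative, not from the codebase):
import numpy as np

def fast_weights(theta, task_targets, alpha=0.1):
    # L_i(theta) = 0.5 * ||theta - target_i||^2, so grad L_i = theta - target_i
    return [theta - alpha * (theta - t) for t in task_targets]

theta = np.zeros(3)
per_task_params = fast_weights(theta, [np.ones(3), -np.ones(3)])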
def optimize_policy(self, itr, all_samples_data):
    """
    :param itr: (int) iteration
    :param all_samples_data: list of length num_grad_updates + 1 containing
        the data collected by the pre- and post-update policies. Each entry
        of the list is a dict with one entry per task in the meta-batch, and
        each of those contains numpy arrays with actions, advantages,
        observations, returns, rewards, env_infos and agent_infos
        (i.e. log_std, mean).
    :return: an empty dict
    """
    # we collected rollouts for the gradient steps and then the final test batch
    assert len(all_samples_data) == self.num_grad_updates + 1

    if not self.use_maml:
        all_samples_data = [all_samples_data[0]]

    input_list = []
    for step in range(len(all_samples_data)):  # these are the gradient steps
        obs_list, action_list, adv_list = [], [], []
        for i in range(self.meta_batch_size):
            inputs = ext.extract(all_samples_data[step][i], "observations",
                                 "actions", "advantages")
            obs_list.append(inputs[0])
            action_list.append(inputs[1])
            adv_list.append(inputs[2])
        # [ [obs_0], [act_0], [adv_0], [obs_1], ... ]
        input_list += obs_list + action_list + adv_list

        if step == 0:  # CF: not used?
            init_inputs = input_list

    if self.use_maml:
        dist_info_list = []
        for i in range(self.meta_batch_size):
            agent_infos = all_samples_data[self.kl_constrain_step][i]['agent_infos']
            dist_info_list += [
                agent_infos[k]
                for k in self.policy.distribution.dist_info_keys
            ]
        input_list += tuple(dist_info_list)
        logger.log("Computing KL before")
        mean_kl_before = self.optimizer.constraint_val(input_list)

    logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_list)
    logger.log("Optimizing")
    self.optimizer.optimize(input_list)
    logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_list)

    if self.use_maml:
        logger.log("Computing KL after")
        mean_kl = self.optimizer.constraint_val(input_list)
        logger.record_tabular('MeanKLBefore', mean_kl_before)  # this now won't be 0!
        logger.record_tabular('MeanKL', mean_kl)

    logger.record_tabular('LossBefore', loss_before)
    logger.record_tabular('LossAfter', loss_after)
    logger.record_tabular('dLoss', loss_before - loss_after)
    return dict()
def compute_updated_dists(self, samples):
    """
    Compute fast gradients once per iteration and pull them out of
    tensorflow for sampling with the post-update policy.
    """
    start = time.time()
    num_tasks = len(samples)
    param_keys = self.all_params.keys()
    update_param_keys = param_keys
    no_update_param_keys = []

    sess = tf.get_default_session()

    obs_list, action_list, adv_list = [], [], []
    for i in range(num_tasks):
        inputs = ext.extract(samples[i], 'observations', 'actions',
                             'advantages')
        obs_list.append(inputs[0])
        action_list.append(inputs[1])
        adv_list.append(inputs[2])
    inputs = obs_list + action_list + adv_list

    # To do a second update, replace self.all_params below with the params
    # that were used to collect the policy.
    init_param_values = None
    if self.all_param_vals is not None:  # skip this on the first iteration
        init_param_values = self.get_variable_values(self.all_params)

    step_size = self.step_size
    for i in range(num_tasks):
        if self.all_param_vals is not None:  # skip this on the first iteration
            self.assign_params(self.all_params, self.all_param_vals[i])

    if 'all_fast_params_tensor' not in dir(self):  # only enter on the first iteration
        # make the computation graph once
        self.all_fast_params_tensor = []
        for i in range(num_tasks):
            # compute gradients for the current task (symbolic)
            gradients = dict(
                zip(update_param_keys,
                    tf.gradients(self.surr_objs[i], [
                        self.all_params[key] for key in update_param_keys
                    ])))

            # gradient update for the params of the current task (symbolic)
            fast_params_tensor = OrderedDict(
                zip(update_param_keys, [
                    self.all_params[key] - step_size * gradients[key]
                    for key in update_param_keys
                ]))

            # undo the gradient update for no_update_params (symbolic)
            for k in no_update_param_keys:
                fast_params_tensor[k] = self.all_params[k]

            # tensors that represent the updated params for all tasks (symbolic)
            self.all_fast_params_tensor.append(fast_params_tensor)

    # pull the new param values out of tensorflow, so the gradient computation
    # is only done once; these are the updated values after the gradient step
    self.all_param_vals = sess.run(
        self.all_fast_params_tensor,
        feed_dict=dict(list(zip(self.input_list_for_grad, inputs))))

    # reset parameters to the original ones
    if init_param_values is not None:  # skip this on the first iteration
        self.assign_params(self.all_params, init_param_values)

    # compile _cur_f_dist with the updated params
    outputs = []
    inputs = tf.split(self.input_tensor, num_tasks, 0)
    for i in range(num_tasks):
        # TODO - use a placeholder to feed in the params, so that we don't
        # have to recompile every time.
        task_inp = inputs[i]
        info, _ = self.dist_info_sym(task_inp, dict(),
                                     all_params=self.all_param_vals[i],
                                     is_training=False)
        outputs.append([info['mean'], info['log_std']])

    self._cur_f_dist = tensor_utils.compile_function(
        inputs=[self.input_tensor],
        outputs=outputs,
    )

    total_time = time.time() - start
    logger.record_tabular("ComputeUpdatedDistTime", total_time)