def make_vars(self, stepnum='0'):
    # lists over the meta_batch_size
    obs_vars, action_vars, adv_vars, imp_vars = [], [], [], []
    for i in range(self.meta_batch_size):
        obs_vars.append(self.env.observation_space.new_tensor_variable(
            'obs' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        action_vars.append(self.env.action_space.new_tensor_variable(
            'action' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        adv_vars.append(tensor_utils.new_tensor(
            name='advantage' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
        imp_vars.append(tensor_utils.new_tensor(
            name='imp_ratios' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
    return obs_vars, action_vars, adv_vars, imp_vars
def make_vars(self):
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1,
    )
    adv_var = tensor_utils.new_tensor(
        name='advantage', ndim=1, dtype=tf.float32,
    )
    noise_var = tf.placeholder(dtype=tf.float32,
                               shape=[None, self.latent_dim],
                               name='noise')
    task_idx_var = tensor_utils.new_tensor(
        name='task_idx', ndim=1, dtype=tf.int32,
    )
    return obs_var, action_var, adv_var, noise_var, task_idx_var
def make_vars(self, stepnum='0'):
    # lists over the meta_batch_size
    obs_vars, action_vars, adv_vars, noise_vars, task_idx_vars = \
        [], [], [], [], []
    for i in range(self.meta_batch_size):
        obs_vars.append(self.env.observation_space.new_tensor_variable(
            'obs' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        action_vars.append(self.env.action_space.new_tensor_variable(
            'action' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        adv_vars.append(tensor_utils.new_tensor(
            name='advantage' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
        noise_vars.append(tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.latent_dim],
            name='noise' + stepnum + '_' + str(i)))
        task_idx_vars.append(tensor_utils.new_tensor(
            name='task_idx' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.int32,
        ))
    return obs_vars, action_vars, adv_vars, noise_vars, task_idx_vars
def _init_graph(self, chunk_size):
    with self._graph.as_default():
        with tf.variable_scope('SimilarityCalculator'):
            X = tensor_utils.new_tensor('X', ndim=2, dtype=tf.float32)
            pool = tensor_utils.new_tensor('pool', ndim=2, dtype=tf.float32)
            division_factor = tensor_utils.new_tensor(
                'division_factor', ndim=0, dtype=tf.float32)
            inputs = [X, pool, division_factor]
            size = tf.shape(X)[0]
            if chunk_size is None:
                chunk_size = size
                chunk_size_float = tf.cast(chunk_size, tf.float32)
            else:
                chunk_size_float = float(chunk_size)
            array_size = tf.cast(
                tf.ceil(tf.cast(size, tf.float32) / chunk_size_float),
                tf.int32)
            ta_initial = tf.TensorArray(dtype=tf.float32, size=array_size,
                                        infer_shape=False)

            def _cond(idx, i, ta):
                return i < size

            def _body(idx, i, ta):
                until = tf.minimum(i + chunk_size, size)
                new_pdiffs = X[i:until, tf.newaxis, :] - pool
                squared_l2 = tf.reduce_sum(tf.square(new_pdiffs), axis=-1)
                part_similarities = tf.reduce_mean(
                    tf.exp(-squared_l2 / division_factor), axis=1)
                return idx + 1, until, ta.write(idx, part_similarities)

            final_idx, final_i, ta = tf.while_loop(
                _cond, _body, loop_vars=[0, 0, ta_initial],
                parallel_iterations=1)
            result = ta.concat()
            self._get_result = tensor_utils.compile_function(
                inputs=inputs,
                outputs=result,
            )
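# --- Illustrative reference (not part of the original code) ---
# A minimal NumPy sketch of what the chunked graph above computes: for each
# row x of X, the mean RBF similarity exp(-||x - p||^2 / division_factor)
# over all rows p of the pool. The function name is hypothetical.
import numpy as np

def similarity_reference(X, pool, division_factor):
    # X: (n, d), pool: (m, d) -> (n,) vector of mean similarities
    sq_dists = ((X[:, np.newaxis, :] - pool[np.newaxis, :, :]) ** 2).sum(-1)
    return np.exp(-sq_dists / division_factor).mean(axis=1)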
def make_vars(self, stepnum='0'):
    # lists over the meta_batch_size
    # We should only need the last stepnum for meta-optimization.
    obs_vars, action_vars, adv_vars = [], [], []
    rewards_vars, returns_vars, path_lengths_vars, expert_action_vars = \
        [], [], [], []
    for i in range(self.meta_batch_size):
        obs_vars.append(self.env.observation_space.new_tensor_variable(
            'obs' + stepnum + '_' + str(i),
            extra_dims=1,
            add_to_flat_dim=(0 if self.extra_input is None
                             else self.extra_input_dim),
        ))
        action_vars.append(self.env.action_space.new_tensor_variable(
            'action' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        adv_vars.append(tensor_utils.new_tensor(
            'advantage' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
        if self.metalearn_baseline:
            rewards_vars.append(tensor_utils.new_tensor(
                'rewards' + stepnum + '_' + str(i),
                ndim=1, dtype=tf.float32,
            ))
            returns_vars.append(tensor_utils.new_tensor(
                'returns' + stepnum + '_' + str(i),
                ndim=1, dtype=tf.float32,
            ))
            # path_lengths_vars.append(tensor_utils.new_tensor(
            #     'path_lengths' + stepnum + '_' + str(i),
            #     ndim=1, dtype=tf.float32,
            # ))
        expert_action_vars.append(tensor_utils.new_tensor(
            'expert_actions' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
    if not self.metalearn_baseline:
        return obs_vars, action_vars, adv_vars, expert_action_vars
    else:
        # path_lengths_vars before expert action
        return (obs_vars, action_vars, adv_vars, rewards_vars,
                returns_vars, expert_action_vars)
def make_vars(self, stepnum='0'):
    # lists over the meta_batch_size
    obs_vars, action_vars, adv_vars = [], [], []
    for i in range(self.meta_batch_size):
        obs_vars.append(self.env.observation_space.new_tensor_variable(
            'obs' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        action_vars.append(tf.placeholder(
            tf.float32,
            shape=[None] + [self.env.action_space.flat_dim * 20],
            name='action' + stepnum + '_' + str(i)))
        # action_vars.append(self.env.action_space.new_tensor_variable(
        #     'action' + stepnum + '_' + str(i),
        #     extra_dims=1,
        # ))
        adv_vars.append(tensor_utils.new_tensor(
            name='advantage' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
    return obs_vars, action_vars, adv_vars
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if is_recurrent:
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        surr_loss = -tf.reduce_sum(lr * advantage_var * valid_var) \
            / tf.reduce_sum(valid_var)
    else:
        mean_kl = tf.reduce_mean(kl)
        surr_loss = -tf.reduce_mean(lr * advantage_var)
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl"
    )
    return dict()
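# --- Illustrative reference (not part of the original code) ---
# Self-contained NumPy sketch of the quantities the TRPO graph above builds
# for a diagonal-Gaussian policy: the likelihood ratio
# r = exp(log pi_new(a|s) - log pi_old(a|s)), the surrogate loss
# -mean(r * advantage), and the mean-KL constraint. Names are hypothetical.
import numpy as np

def gaussian_log_lik(a, mean, log_std):
    z = (a - mean) / np.exp(log_std)
    return -0.5 * np.sum(z ** 2 + 2 * log_std + np.log(2 * np.pi), axis=-1)

def trpo_surrogate(a, adv, old_mean, old_log_std, new_mean, new_log_std):
    lr = np.exp(gaussian_log_lik(a, new_mean, new_log_std)
                - gaussian_log_lik(a, old_mean, old_log_std))
    surr_loss = -np.mean(lr * adv)
    # KL(old || new) for diagonal Gaussians, averaged over the batch
    var_old, var_new = np.exp(2 * old_log_std), np.exp(2 * new_log_std)
    kl = np.sum(new_log_std - old_log_std
                + (var_old + (old_mean - new_mean) ** 2) / (2 * var_new)
                - 0.5, axis=-1)
    return surr_loss, np.mean(kl)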
def opt_helper(self, policy, optimizer):
    is_recurrent = int(policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent)
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent)
    advantage_var = tensor_utils.new_tensor(
        name='advantage', ndim=1 + is_recurrent, dtype=tf.float32)
    dist = policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -tf.reduce_sum(logli * advantage_var * valid_var) \
            / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        max_kl = tf.reduce_max(kl * valid_var)
    else:
        surr_obj = -tf.reduce_mean(logli * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)
    input_list = [obs_var, action_var, advantage_var] + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    optimizer.update_opt(loss=surr_obj, target=policy, inputs=input_list)
    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl])
    opt_info = dict(f_kl=f_kl)
    return opt_info
def init_opt(self):
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1,
    )
    advantage_var = tensor_utils.new_tensor(
        name='advantage', ndim=1, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] + list(shape), name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    # todo, delete this var
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    loglik = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    surr_obj = -tf.reduce_mean(loglik * advantage_var)
    mean_kl = tf.reduce_mean(kl)
    max_kl = tf.reduce_max(kl)
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list + old_dist_info_vars_list
    self.optimizer.update_opt(loss=surr_obj,
                              target=self.policy,
                              leq_constraint=(mean_kl, self.delta),
                              inputs=input_list)
    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl)
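# --- Illustrative reference (not part of the original code) ---
# The vanilla policy-gradient surrogate built above, in one line of NumPy:
# minimizing -mean(log pi(a|s) * A) yields the REINFORCE gradient estimate.
import numpy as np

def pg_surrogate(log_likelihoods, advantages):
    return -np.mean(log_likelihoods * advantages)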
def make_vars_latent(self):
    adv_var = tensor_utils.new_tensor(
        name='advantage_latent', ndim=1, dtype=tf.float32,
    )
    z_var = tf.placeholder(dtype=tf.float32,
                           shape=[None, self.latent_dim],
                           name='zs_latent')
    task_idx_var = tensor_utils.new_tensor(
        name='task_idx_latent', ndim=1, dtype=tf.int32,
    )
    return adv_var, z_var, task_idx_var
def __init__(self, env_spec, reg_coeff=1e-5):
    self._coeffs = None
    self._reg_coeff = reg_coeff
    self.feature_mat = tensor_utils.new_tensor(
        'feature_mat', ndim=2, dtype=tf.float32,
    )
    self.returns = tensor_utils.new_tensor(
        'returns', ndim=2, dtype=tf.float32,
    )
    # Regularized least squares via the normal equations:
    # solve (F^T F + reg * I) c = F^T y for the baseline coefficients.
    ident = tf.eye(tf.shape(self.feature_mat)[1])
    self.train_ops = tf.matrix_solve_ls(
        tf.matmul(self.feature_mat, self.feature_mat, transpose_a=True)
        + self._reg_coeff * ident,
        tf.matmul(self.feature_mat, self.returns, transpose_a=True),
        fast=False)
    self.sess = tf.Session()
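# --- Illustrative reference (not part of the original code) ---
# NumPy version of the regularized least-squares solve above, i.e. the
# classic linear-feature-baseline fit: c = (F^T F + reg * I)^-1 F^T y.
import numpy as np

def fit_baseline_coeffs(feature_mat, returns, reg_coeff=1e-5):
    a = feature_mat.T.dot(feature_mat) \
        + reg_coeff * np.identity(feature_mat.shape[1])
    b = feature_mat.T.dot(returns)
    return np.linalg.solve(a, b)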
def make_vars_latent(self, stepnum='0'):
    # lists over the meta_batch_size
    adv_vars, z_vars, task_idx_vars = [], [], []
    for i in range(self.meta_batch_size):
        adv_vars.append(tensor_utils.new_tensor(
            name='advantage_latent' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
        z_vars.append(tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.latent_dim],
            name='zs_latent' + stepnum + '_' + str(i)))
        task_idx_vars.append(tensor_utils.new_tensor(
            name='task_idx_latents' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.int32,
        ))
    return adv_vars, z_vars, task_idx_vars
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name='advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    self.input_list_for_grad = [obs_var, action_var, advantage_var] \
        + state_info_vars_list
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -tf.reduce_sum(logli * advantage_var * valid_var) \
            / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        max_kl = tf.reduce_max(kl * valid_var)
    else:
        surr_obj = -tf.reduce_mean(logli * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)
    self.surr_obj = surr_obj
def make_vars(self, stepnum='0'):
    # lists over the meta_batch_size
    obs_vars, action_vars, adv_vars = [], [], []
    for i in range(self.meta_batch_size):
        obs_vars.append(self.env.observation_space.new_tensor_variable(
            'obs' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        action_vars.append(self.env.action_space.new_tensor_variable(
            'action' + stepnum + '_' + str(i),
            extra_dims=1,
        ))
        adv_vars.append(tensor_utils.new_tensor(
            name='advantage' + stepnum + '_' + str(i),
            ndim=1, dtype=tf.float32,
        ))
    return obs_vars, action_vars, adv_vars
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    nobs_var = self.env.observation_space.new_tensor_variable(
        'nobs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    empw_var = tensor_utils.new_tensor(
        'empowerment', ndim=2 + is_recurrent, dtype=tf.float32,
    )
    input_list = [obs_var, nobs_var, action_var, advantage_var, empw_var]
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    # dist_info_vars["mean"] = dist_info_vars["mean"] + empw_var
    q_input = tf.concat([obs_var, nobs_var], axis=1)
    q_dist_info_vars = self.qvar_model.dist_info_sym(q_input,
                                                     state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if self.pol_ent_wt > 0:
        if 'log_std' in dist_info_vars:
            log_std = dist_info_vars['log_std']
            ent = tf.reduce_sum(log_std + tf.log(tf.sqrt(2 * np.pi * np.e)),
                                reduction_indices=-1)
        elif 'prob' in dist_info_vars:
            prob = dist_info_vars['prob']
            ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
        else:
            raise NotImplementedError()
        ent = tf.stop_gradient(ent)
        adv = advantage_var + self.pol_ent_wt * ent
    else:
        adv = advantage_var
    if is_recurrent:
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        surr_loss = -tf.reduce_sum(lr * adv * valid_var) \
            / tf.reduce_sum(valid_var)
    else:
        mean_kl = tf.reduce_mean(kl)
        surr_loss = -tf.reduce_mean(lr * adv)
    if self.train_empw:
        print("training empowerment========================================")
        pred = dist.log_likelihood(dist.sample(dist_info_vars),
                                   dist_info_vars) + empw_var
        target = dist.log_likelihood(dist.sample(q_dist_info_vars),
                                     q_dist_info_vars)
        # print("pred = {}, target={}".format(pred.shape, target.shape))
        surr_loss = surr_loss + self.lambda_i * tf.losses.mean_squared_error(
            predictions=pred, labels=target)
    input_list += state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    self.optimizer.update_opt(loss=surr_loss,
                              target=self.policy,
                              leq_constraint=(mean_kl, self.step_size),
                              inputs=input_list,
                              constraint_name="mean_kl")
    return dict()
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
        add_to_flat_dim=(0 if self.extra_input is None
                         else self.extra_input_dim),
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name='advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    # logli_old = dist.log_likelihood_sym(action_var, old_dist_info_vars)
    r__ = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                    dist_info_vars)
    clip_frac = tf.reduce_mean(
        tf.to_float(tf.greater(tf.abs(r__ - 1.0), 0.2)))
    r_ = tf.clip_by_value(r__, 0.8, 1.2)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -tf.reduce_sum(r_ * advantage_var * valid_var) \
            / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        max_kl = tf.reduce_max(kl * valid_var)
    else:
        surr_obj = -tf.reduce_mean(r_ * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    # self.policy.set_init_surr_obj(input_list, [surr_obj])  # debugging
    self.optimizer.update_opt(loss=surr_obj, target=self.policy,
                              inputs=input_list)
    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl, clip_frac, dist_info_vars['log_std']],
    )
    self.opt_info = dict(f_kl=f_kl)
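# --- Illustrative reference (not part of the original code) ---
# NumPy sketch of the PPO-style clipping above: the likelihood ratio is
# clipped to [1 - eps, 1 + eps] (eps = 0.2 in the graph) before being
# multiplied by the advantage, and clip_frac reports how often the clip is
# active. Note the graph above uses the clipped ratio directly rather than
# the min(unclipped, clipped) form.
import numpy as np

def clipped_surrogate(ratio, adv, eps=0.2):
    clip_frac = np.mean(np.abs(ratio - 1.0) > eps)
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps)
    return -np.mean(clipped * adv), clip_frac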
def init_opt(self, name=''):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        name + 'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        name + 'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name + 'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=name + 'old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=name + k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name=name + "valid")
    else:
        valid_var = None
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    if self.kl_sample_backups > 0:
        kl_obs_var = self.env.observation_space.new_tensor_variable(
            name + 'kl_obs', extra_dims=1 + is_recurrent,
        )
        kl_old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=name + 'kl_old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        kl_old_dist_info_vars_list = [kl_old_dist_info_vars[k]
                                      for k in dist.dist_info_keys]
        kl_state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=name + 'kl_%s' % k)
            for k, shape in self.policy.state_info_specs
        }
        kl_state_info_vars_list = [kl_state_info_vars[k]
                                   for k in self.policy.state_info_keys]
        kl_dist_info_vars = self.policy.dist_info_sym(kl_obs_var,
                                                      kl_state_info_vars)
        kl = dist.kl_sym(kl_old_dist_info_vars, kl_dist_info_vars)
        input_list += [kl_obs_var] + kl_state_info_vars_list \
            + kl_old_dist_info_vars_list
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    else:
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if not self.qprop:
        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) \
                / tf.reduce_sum(valid_var)
            surr_loss = -tf.reduce_sum(lr * advantage_var * valid_var) \
                / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * advantage_var)
    else:
        if is_recurrent:
            raise NotImplementedError
        eta_var = tensor_utils.new_tensor(
            'eta', ndim=1 + is_recurrent, dtype=tf.float32,
        )
        surr_loss = -tf.reduce_mean(lr * advantage_var)
        if self.qprop_nu > 0:
            surr_loss *= 1 - self.qprop_nu
        if self.sample_backups > 0 or not self.policy_sample_last:
            off_obs_var = self.env.observation_space.new_tensor_variable(
                name + 'off_obs', extra_dims=1 + is_recurrent,
            )
            off_e_qval = self.qf.get_e_qval_sym(off_obs_var, self.policy,
                                                deterministic=True)
            input_list += [off_obs_var]
            surr_loss -= tf.reduce_mean(off_e_qval)  # * eta_var)
        else:
            if not self.mqprop:
                # Originally, we subtract this value for the bias
                # correction, but we don't do that if we want mqprop
                # (no action-conditional baseline).
                e_qval = self.qf.get_e_qval_sym(obs_var, self.policy,
                                                deterministic=True)
                surr_loss -= tf.reduce_mean(e_qval * eta_var)
        mean_kl = tf.reduce_mean(kl)
        input_list += [eta_var]
        control_variate = self.qf.get_cv_sym(obs_var, action_var,
                                             self.policy)
        f_control_variate = tensor_utils.compile_function(
            inputs=[obs_var, action_var],
            outputs=control_variate,
        )
        self.opt_info_qprop = dict(f_control_variate=f_control_variate)
    if self.ac_delta > 0:
        ac_obs_var = self.env.observation_space.new_tensor_variable(
            name + 'ac_obs', extra_dims=1 + is_recurrent,
        )
        e_qval = self.qf.get_e_qval_sym(ac_obs_var, self.policy,
                                        deterministic=True)
        input_list += [ac_obs_var]
        surr_loss *= (1.0 - self.ac_delta)
        surr_loss -= self.ac_delta * tf.reduce_mean(e_qval)
    self.optimizer.update_opt(loss=surr_loss,
                              target=self.policy,
                              leq_constraint=(mean_kl, self.step_size),
                              inputs=input_list,
                              constraint_name="mean_kl")
    self.opt_info = dict(target_policy=self.policy)
    self.init_opt_critic()
    return dict()
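# --- Illustrative reference (not part of the original code) ---
# Hypothetical NumPy sketch of the Q-Prop correction wired up above: the
# analytic control variate cv(s, a) from the critic is subtracted from the
# Monte-Carlo advantage, while its expectation re-enters the surrogate loss
# through the -mean(eta * E[Q]) term; eta gates the correction per state.
def qprop_adjusted_advantage(advantages, control_variate, eta):
    return advantages - eta * control_variate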
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if self.pol_ent_wt > 0:
        if 'log_std' in dist_info_vars:
            log_std = dist_info_vars['log_std']
            ent = tf.reduce_sum(log_std + tf.log(tf.sqrt(2 * np.pi * np.e)),
                                reduction_indices=-1)
        elif 'prob' in dist_info_vars:
            prob = dist_info_vars['prob']
            ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
        else:
            raise NotImplementedError()
        ent = tf.stop_gradient(ent)
        adv = advantage_var + self.pol_ent_wt * ent
    else:
        adv = advantage_var
    if is_recurrent:
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        surr_loss = -tf.reduce_sum(lr * adv * valid_var) \
            / tf.reduce_sum(valid_var)
    else:
        mean_kl = tf.reduce_mean(kl)
        surr_loss = -tf.reduce_mean(lr * adv)
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    self.optimizer.update_opt(loss=surr_loss,
                              target=self.policy,
                              leq_constraint=(mean_kl, self.step_size),
                              inputs=input_list,
                              constraint_name="mean_kl")
    return dict()
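# --- Illustrative reference (not part of the original code) ---
# NumPy sketch of the entropy bonus folded into the advantages above: a
# diagonal Gaussian has entropy sum(log_std + 0.5 * log(2 * pi * e)); a
# categorical has -sum(p * log p). The bonus is treated as a constant
# (stop_gradient), so it shifts the advantage rather than adding a
# differentiable entropy term to the loss.
import numpy as np

def entropy_augmented_advantage(adv, ent_wt, log_std=None, probs=None):
    if log_std is not None:
        ent = np.sum(log_std + 0.5 * np.log(2 * np.pi * np.e), axis=-1)
    elif probs is not None:
        ent = -np.sum(probs * np.log(probs), axis=-1)
    else:
        raise NotImplementedError()
    return adv + ent_wt * ent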
def init_opt(self):
    with tf.variable_scope("target_policy"):
        target_policy = Serializable.clone(self.policy)
    oracle_policy = self.oracle_policy
    with tf.variable_scope("target_qf"):
        target_qf = Serializable.clone(self.qf)
    with tf.variable_scope("target_gate_qf"):
        target_gate_qf = Serializable.clone(self.gate_qf)
    obs = self.obs = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1,
    )
    action = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1,
    )
    discrete_action = tensor_utils.new_tensor(
        'discrete_action', ndim=2, dtype=tf.float32,
    )
    yvar = tensor_utils.new_tensor(
        'ys', ndim=1, dtype=tf.float32,
    )
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.qf.get_params(regularizable=True)])
    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.policy.get_params(regularizable=True)])
    policy_qval_novice = self.qf.get_qval_sym(
        obs, self.policy.get_novice_policy_sym(obs), deterministic=True)
    policy_qval_gate = self.discrete_qf.get_qval_sym(
        obs, self.policy.get_action_binary_gate_sym(obs),
        deterministic=True)
    qval = self.qf.get_qval_sym(obs, action)
    qf_loss = tf.reduce_mean(tf.square(yvar - qval))
    qf_reg_loss = qf_loss + qf_weight_decay_term
    discrete_qval = self.gate_qf.get_qval_sym(obs, discrete_action)
    discrete_qf_loss = tf.reduce_mean(tf.square(yvar - discrete_qval))
    discrete_qf_reg_loss = discrete_qf_loss + qf_weight_decay_term
    qf_input_list = [yvar, obs, action]
    discrete_qf_input_list = [yvar, obs, discrete_action]
    policy_input_list = [obs]
    policy_gate_input_list = [obs]
    gating_network = self.policy.get_action_binary_gate_sym(obs)
    policy_surr = -tf.reduce_mean(policy_qval_novice)
    policy_reg_surr = policy_surr + policy_weight_decay_term
    policy_gate_surr = -tf.reduce_mean(policy_qval_gate)
    policy_reg_gate_surr = policy_gate_surr + policy_weight_decay_term
    self.qf_update_method.update_opt(loss=qf_reg_loss,
                                     target=self.qf,
                                     inputs=qf_input_list)
    self.gate_qf_update_method.update_opt(loss=discrete_qf_reg_loss,
                                          target=self.gate_qf,
                                          inputs=discrete_qf_input_list)
    self.policy_update_method.update_opt(loss=policy_reg_surr,
                                         target=self.policy,
                                         inputs=policy_input_list)
    self.policy_gate_update_method.update_opt(
        loss=policy_reg_gate_surr,
        target=self.policy,
        inputs=policy_gate_input_list)
    f_train_qf = tensor_utils.compile_function(
        inputs=qf_input_list,
        outputs=[qf_loss, qval, self.qf_update_method._train_op],
    )
    f_train_discrete_qf = tensor_utils.compile_function(
        inputs=discrete_qf_input_list,
        outputs=[discrete_qf_loss, discrete_qval,
                 self.gate_qf_update_method._train_op],
    )
    f_train_policy = tensor_utils.compile_function(
        inputs=policy_input_list,
        outputs=[policy_surr, self.policy_update_method._train_op],
    )
    f_train_policy_gate = tensor_utils.compile_function(
        inputs=policy_gate_input_list,
        outputs=[policy_gate_surr,
                 self.policy_gate_update_method._train_op,
                 gating_network],
    )
    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_discrete_qf=f_train_discrete_qf,
        f_train_policy=f_train_policy,
        f_train_policy_gate=f_train_policy_gate,
        target_qf=target_qf,
        target_gate_qf=target_gate_qf,
        target_policy=target_policy,
        oracle_policy=oracle_policy,
    )
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    # lists over the meta_batch_size
    context = tf.reshape(self.irl_model.reparam_latent_tile, [
        self.meta_batch_size, -1, self.irl_model.T,
        self.irl_model.latent_dim
    ])
    # if not self.train_irl:
    #     context = tf.stop_gradient(context)
    obs_vars = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    action_vars = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_vars = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    clean_obs_vars = tf.placeholder(
        tf.float32,
        shape=[None] * (1 + is_recurrent)
              + [self.env.observation_space.flat_dim
                 - self.irl_model.latent_dim],
        name='clean_obs')
    policy_input = tf.reshape(
        tf.concat([
            tf.reshape(clean_obs_vars, [
                self.meta_batch_size, -1, self.irl_model.T,
                self.env.observation_space.flat_dim
                - self.irl_model.latent_dim
            ]),
            context
        ], axis=-1),
        [-1, self.env.observation_space.flat_dim])
    # input_list = obs_vars + action_vars + advantage_vars
    input_list = [clean_obs_vars] + [action_vars] + [advantage_vars] \
        + [self.irl_model.expert_traj_var]
    dist = self.policy.distribution
    old_dist_info_vars_list, state_info_vars_list = [], []
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list += [old_dist_info_vars[k]
                                for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='%s' % k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list += [state_info_vars[k]
                             for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_vars = tf.placeholder(tf.float32, shape=[None, None],
                                    name="valid")
    else:
        valid_vars = None
    surr_losses, mean_kls = [], []
    # dist_info_vars = self.policy.dist_info_sym(obs_vars[i], state_info_vars[i])
    dist_info_vars = self.policy.dist_info_sym(policy_input,
                                               state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_vars, old_dist_info_vars,
                                   dist_info_vars)
    if self.pol_ent_wt > 0:
        if 'log_std' in dist_info_vars:
            log_std = dist_info_vars['log_std']
            ent = tf.reduce_sum(log_std + tf.log(tf.sqrt(2 * np.pi * np.e)),
                                reduction_indices=-1)
        elif 'prob' in dist_info_vars:
            prob = dist_info_vars['prob']
            ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
        else:
            raise NotImplementedError()
        ent = tf.stop_gradient(ent)
        adv = advantage_vars + self.pol_ent_wt * ent
    else:
        adv = advantage_vars
    if is_recurrent:
        mean_kl = tf.reduce_sum(kl * valid_vars) / tf.reduce_sum(valid_vars)
        surr_loss = -tf.reduce_sum(lr * adv * valid_vars) \
            / tf.reduce_sum(valid_vars)
    else:
        mean_kl = tf.reduce_mean(kl)
        surr_loss = -tf.reduce_mean(lr * adv)
    surr_losses.append(surr_loss)
    mean_kls.append(mean_kl)
    # mean over meta_batch_size (the diff tasks)
    surr_loss = tf.reduce_mean(tf.stack(surr_losses, 0))
    mean_kl = tf.reduce_mean(tf.stack(mean_kls))
    input_list += state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_vars)
    self.optimizer.update_opt(loss=surr_loss,
                              target=self.policy,
                              leq_constraint=(mean_kl, self.step_size),
                              inputs=input_list,
                              constraint_name="mean_kl")
    return dict()
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    input_list = [obs_var, action_var, advantage_var]
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if self.pol_ent_wt > 0:
        if 'log_std' in dist_info_vars:
            log_std = dist_info_vars['log_std']
            ent = tf.reduce_sum(log_std + tf.log(tf.sqrt(2 * np.pi * np.e)),
                                reduction_indices=-1)
        elif 'prob' in dist_info_vars:
            prob = dist_info_vars['prob']
            ent = -tf.reduce_sum(prob * tf.log(prob), reduction_indices=-1)
        else:
            raise NotImplementedError()
        ent = tf.stop_gradient(ent)
        adv = advantage_var + self.pol_ent_wt * ent
    else:
        adv = advantage_var
    if is_recurrent:
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        surr_loss = -tf.reduce_sum(lr * adv * valid_var) \
            / tf.reduce_sum(valid_var)
    else:
        mean_kl = tf.reduce_mean(kl)
        surr_loss = -tf.reduce_mean(lr * adv)
    input_list += state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl"
    )
    return dict()
def init_opt(self):
    # First, create "target" policy and Q functions
    with tf.variable_scope("target_policy"):
        target_policy = Serializable.clone(self.policy)
    with tf.variable_scope("target_qf"):
        target_qf = Serializable.clone(self.qf)
    # y need to be computed first
    obs = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1,
    )
    # The yi values are computed separately as above and then passed to
    # the training functions below
    action = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1,
    )
    yvar = tensor_utils.new_tensor(
        'ys', ndim=1, dtype=tf.float32,
    )
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.qf.get_params(regularizable=True)])
    qval = self.qf.get_qval_sym(obs, action)
    qf_loss = tf.reduce_mean(tf.square(yvar - qval))
    qf_reg_loss = qf_loss + qf_weight_decay_term
    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.policy.get_params(regularizable=True)])
    policy_qval = self.qf.get_qval_sym(obs,
                                       self.policy.get_action_sym(obs),
                                       deterministic=True)
    policy_surr = -tf.reduce_mean(policy_qval)
    policy_reg_surr = policy_surr + policy_weight_decay_term
    qf_input_list = [yvar, obs, action]
    policy_input_list = [obs]
    self.qf_update_method.update_opt(loss=qf_reg_loss,
                                     target=self.qf,
                                     inputs=qf_input_list)
    self.policy_update_method.update_opt(loss=policy_reg_surr,
                                         target=self.policy,
                                         inputs=policy_input_list)
    f_train_qf = tensor_utils.compile_function(
        inputs=qf_input_list,
        outputs=[qf_loss, qval, self.qf_update_method._train_op],
    )
    f_train_policy = tensor_utils.compile_function(
        inputs=policy_input_list,
        outputs=[policy_surr, self.policy_update_method._train_op],
    )
    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
    )
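# --- Illustrative reference (not part of the original code) ---
# Hypothetical NumPy sketch of how the 'ys' fed to f_train_qf above are
# typically formed in DDPG, using the cloned target networks:
# y = r + gamma * Q_target(s', mu_target(s')), with a zero bootstrap at
# terminal states.
import numpy as np

def ddpg_targets(rewards, terminals, next_target_qvals, discount=0.99):
    return rewards + discount * (1.0 - terminals) * next_target_qvals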
def init_opt(self):
    # First, create "target" policy and Q functions
    with tf.variable_scope("target_policy"):
        target_policy = Serializable.clone(self.policy)
    with tf.variable_scope("target_qf"):
        target_qf = Serializable.clone(self.qf)
    # y need to be computed first
    obs = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1,
    )
    # The yi values are computed separately as above and then passed to
    # the training functions below
    action = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1,
    )
    yvar = tensor_utils.new_tensor(
        'ys', ndim=1, dtype=tf.float32,
    )
    obs_offpolicy = self.env.observation_space.new_tensor_variable(
        'obs_offpolicy', extra_dims=1,
    )
    action_offpolicy = self.env.action_space.new_tensor_variable(
        'action_offpolicy', extra_dims=1,
    )
    yvar_offpolicy = tensor_utils.new_tensor(
        'ys_offpolicy', ndim=1, dtype=tf.float32,
    )
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.qf.get_params(regularizable=True)])
    qval = self.qf.get_qval_sym(obs, action)
    qval_off = self.qf.get_qval_sym(obs_offpolicy, action_offpolicy)
    qf_loss = tf.reduce_mean(tf.square(yvar - qval))
    qf_loss_off = tf.reduce_mean(tf.square(yvar_offpolicy - qval_off))
    # TODO: penalize dramatic changes in gating_func
    # if PENALIZE_GATING_DISTRIBUTION_DIVERGENCE:
    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.policy.get_params(regularizable=True)])
    policy_qval = self.qf.get_qval_sym(obs,
                                       self.policy.get_action_sym(obs),
                                       deterministic=True)
    policy_qval_off = self.qf.get_qval_sym(
        obs_offpolicy,
        self.policy.get_action_sym(obs_offpolicy),
        deterministic=True)
    policy_surr = -tf.reduce_mean(policy_qval)
    policy_surr_off = -tf.reduce_mean(policy_qval_off)
    if self.sigma_type == 'unified-gated' or \
            self.sigma_type == 'unified-gated-decaying':
        print("Using Gated Sigma!")
        input_to_gates = tf.concat([obs, obs_offpolicy], axis=1)
        assert input_to_gates.get_shape().as_list()[-1] == \
            obs.get_shape().as_list()[-1] + \
            obs_offpolicy.get_shape().as_list()[-1]
        # TODO: right now this is a soft-gate, should make a hard-gate
        # (options vs mixtures)
        gating_func = MLP(
            name="sigma_gate",
            output_dim=1,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.sigmoid,
            input_var=input_to_gates,
            input_shape=tuple(input_to_gates.get_shape().as_list()[1:]),
        ).output
    elif self.sigma_type == 'unified':
        # sample a bernoulli random variable
        print("Using Bernoulli sigma!")
        gating_func = tf.cast(self.random_dist.sample(qf_loss.get_shape()),
                              tf.float32)
    elif self.sigma_type == 'unified-decaying':
        print("Using decaying sigma!")
        gating_func = tf.train.exponential_decay(1.0, self.train_step, 20,
                                                 0.96, staircase=True)
    else:
        raise Exception("sigma type not supported")
    qf_inputs_list = [yvar, obs, action, yvar_offpolicy, obs_offpolicy,
                      action_offpolicy, self.train_step]
    qf_reg_loss = qf_loss * (1.0 - gating_func) \
        + qf_loss_off * gating_func + qf_weight_decay_term
    policy_input_list = [obs, obs_offpolicy, self.train_step]
    policy_reg_surr = policy_surr * (1.0 - gating_func) \
        + policy_surr_off * gating_func + policy_weight_decay_term
    if self.sigma_type == 'unified-gated-decaying':
        print("Adding a decaying factor to gated sigma!")
        decaying_factor = tf.train.exponential_decay(.5, self.train_step,
                                                     20, 0.96,
                                                     staircase=True)
        penalty = decaying_factor * tf.nn.l2_loss(gating_func)
        qf_reg_loss += penalty
        policy_reg_surr += penalty
    self.qf_update_method.update_opt(qf_reg_loss,
                                     target=self.qf,
                                     inputs=qf_inputs_list)
    self.policy_update_method.update_opt(policy_reg_surr,
                                         target=self.policy,
                                         inputs=policy_input_list)
    f_train_qf = tensor_utils.compile_function(
        inputs=qf_inputs_list,
        outputs=[qf_loss, qval, self.qf_update_method._train_op],
    )
    f_train_policy = tensor_utils.compile_function(
        inputs=policy_input_list,
        outputs=[policy_surr, self.policy_update_method._train_op],
    )
    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
    )
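# --- Illustrative reference (not part of the original code) ---
# The convex loss mixture built above, reduced to its essence: a gate
# g in [0, 1] interpolates between the on-policy and off-policy losses for
# both the critic and the actor.
def mixed_loss(loss_on, loss_off, gate):
    return loss_on * (1.0 - gate) + loss_off * gate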
def init_opt(self, lambda_s=100, lambda_v=10, tau=.5):
    with tf.variable_scope("target_policy"):
        target_policy = Serializable.clone(self.policy)
    oracle_policy = self.oracle_policy
    with tf.variable_scope("target_qf"):
        target_qf = Serializable.clone(self.qf)
    obs = self.obs = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1,
    )
    action = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1,
    )
    yvar = tensor_utils.new_tensor(
        'ys', ndim=1, dtype=tf.float32,
    )
    qf_weight_decay_term = 0.5 * self.qf_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.qf.get_params(regularizable=True)])
    qval = self.qf.get_qval_sym(obs, action)
    qf_loss = tf.reduce_mean(tf.square(yvar - qval))
    qf_reg_loss = qf_loss + qf_weight_decay_term
    policy_weight_decay_term = 0.5 * self.policy_weight_decay * \
        sum([tf.reduce_sum(tf.square(param))
             for param in self.policy.get_params(regularizable=True)])
    qf_input_list = [yvar, obs, action]
    policy_input_list = [obs]
    obs_oracle = self.env.observation_space.new_tensor_variable(
        'obs_oracle', extra_dims=1,
    )
    action_oracle = self.env.action_space.new_tensor_variable(
        'action_oracle', extra_dims=1,
    )
    yvar_oracle = tensor_utils.new_tensor(
        'ys_oracle', ndim=1, dtype=tf.float32,
    )
    qval_oracle = self.qf.get_qval_sym(obs_oracle, action_oracle)
    qf_loss_oracle = tf.reduce_mean(tf.square(yvar_oracle - qval_oracle))
    qf_reg_loss_oracle = qf_loss_oracle + qf_weight_decay_term
    policy_qval_novice = self.qf.get_qval_sym(
        obs, self.policy.get_novice_policy_sym(obs), deterministic=True)
    gating_network = self.policy.get_action_binary_gate_sym(obs)
    policy_qval_oracle = self.qf.get_qval_sym(
        obs, self.policy.get_action_oracle_sym(obs), deterministic=True)
    combined_losses = tf.concat([
        tf.reshape(policy_qval_novice, [-1, 1]),
        tf.reshape(policy_qval_oracle, [-1, 1])
    ], axis=1)
    combined_loss = -tf.reduce_mean(tf.reshape(
        tf.reduce_mean(combined_losses * gating_network, axis=1),
        [-1, 1]), axis=0)
    lambda_s_loss = tf.constant(0.0)
    if lambda_s > 0.0:
        lambda_s_loss = lambda_s * (
            tf.reduce_mean(
                (tf.reduce_mean(gating_network, axis=0) - tau) ** 2)
            + tf.reduce_mean(
                (tf.reduce_mean(gating_network, axis=1) - tau) ** 2))
    lambda_v_loss = tf.constant(0.0)
    if lambda_v > 0.0:
        mean0, var0 = tf.nn.moments(gating_network, axes=[0])
        mean, var1 = tf.nn.moments(gating_network, axes=[1])
        lambda_v_loss = -lambda_v * (tf.reduce_mean(var0)
                                     + tf.reduce_mean(var1))
    policy_surr = combined_loss
    policy_reg_surr = combined_loss + policy_weight_decay_term \
        + lambda_s_loss + lambda_v_loss
    gf_input_list = [obs_oracle, action_oracle, yvar_oracle] \
        + qf_input_list
    self.qf_update_method.update_opt(loss=qf_reg_loss,
                                     target=self.qf,
                                     inputs=qf_input_list)
    self.policy_update_method.update_opt(loss=policy_reg_surr,
                                         target=self.policy,
                                         inputs=policy_input_list)
    f_train_qf = tensor_utils.compile_function(
        inputs=qf_input_list,
        outputs=[qf_loss, qval, self.qf_update_method._train_op],
    )
    f_train_policy = tensor_utils.compile_function(
        inputs=policy_input_list,
        outputs=[policy_surr, self.policy_update_method._train_op,
                 gating_network],
    )
    self.opt_info = dict(
        f_train_qf=f_train_qf,
        f_train_policy=f_train_policy,
        target_qf=target_qf,
        target_policy=target_policy,
        oracle_policy=oracle_policy,
    )
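# --- Illustrative reference (not part of the original code) ---
# NumPy sketch of the gating regularizers above: the lambda_s term pulls
# the per-row and per-column mean gate activations toward tau, while the
# lambda_v term rewards variance in the gate so it actually switches
# between the novice and the oracle.
import numpy as np

def gating_penalties(gate, lambda_s=100.0, lambda_v=10.0, tau=0.5):
    s_loss = lambda_s * (((gate.mean(axis=0) - tau) ** 2).mean()
                         + ((gate.mean(axis=1) - tau) ** 2).mean())
    v_loss = -lambda_v * (gate.var(axis=0).mean() + gate.var(axis=1).mean())
    return s_loss, v_loss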
def init_opt(self):
    ###############################
    #
    # Variable Definitions
    #
    ###############################
    all_task_dist_info_vars = []
    all_obs_vars = []
    for i, policy in enumerate(self.local_policies):
        task_obs_var = self.env_partitions[i].observation_space \
            .new_tensor_variable('obs%d' % i, extra_dims=1)
        task_dist_info_vars = []
        for j, other_policy in enumerate(self.local_policies):
            state_info_vars = dict()  # Not handling recurrent policies
            dist_info_vars = other_policy.dist_info_sym(task_obs_var,
                                                        state_info_vars)
            task_dist_info_vars.append(dist_info_vars)
        all_obs_vars.append(task_obs_var)
        all_task_dist_info_vars.append(task_dist_info_vars)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1)
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1)
    advantage_var = tensor_utils.new_tensor('advantage', ndim=1,
                                            dtype=tf.float32)
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] + list(shape),
                          name='old_%s' % k)
        for k, shape in self.policy.distribution.dist_info_specs
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k]
        for k in self.policy.distribution.dist_info_keys
    ]
    input_list = [obs_var, action_var, advantage_var] \
        + old_dist_info_vars_list + all_obs_vars

    ###############################
    #
    # Local Policy Optimization
    #
    ###############################
    self.optimizers = []
    self.metrics = []
    for n, policy in enumerate(self.local_policies):
        state_info_vars = dict()
        dist_info_vars = policy.dist_info_sym(obs_var, state_info_vars)
        dist = policy.distribution
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                       dist_info_vars)
        surr_loss = -tf.reduce_mean(lr * advantage_var)
        if self.constrain_together:
            additional_loss = Metrics.kl_on_others(n, dist,
                                                   all_task_dist_info_vars)
        else:
            additional_loss = tf.constant(0.0)
        local_loss = surr_loss + self.penalty * additional_loss
        kl_metric = tensor_utils.compile_function(
            inputs=input_list,
            outputs=additional_loss,
            log_name="KLPenalty%d" % n)
        self.metrics.append(kl_metric)
        mean_kl_constraint = tf.reduce_mean(kl)
        optimizer = self.optimizer_class(**self.optimizer_args)
        optimizer.update_opt(
            loss=local_loss,
            target=policy,
            leq_constraint=(mean_kl_constraint, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl_%d" % n,
        )
        self.optimizers.append(optimizer)

    ###############################
    #
    # Global Policy Optimization
    #
    ###############################
    # Behaviour Cloning Loss
    state_info_vars = dict()
    center_dist_info_vars = self.policy.dist_info_sym(obs_var,
                                                      state_info_vars)
    behaviour_cloning_loss = tf.losses.mean_squared_error(
        action_var, center_dist_info_vars['mean'])
    self.center_optimizer = FirstOrderOptimizer(max_epochs=1,
                                                verbose=True,
                                                batch_size=1000)
    self.center_optimizer.update_opt(behaviour_cloning_loss, self.policy,
                                     [obs_var, action_var])
    # TRPO Loss
    kl = dist.kl_sym(old_dist_info_vars, center_dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   center_dist_info_vars)
    center_trpo_loss = -tf.reduce_mean(lr * advantage_var)
    mean_kl_constraint = tf.reduce_mean(kl)
    optimizer = self.optimizer_class(**self.optimizer_args)
    optimizer.update_opt(
        loss=center_trpo_loss,
        target=self.policy,
        leq_constraint=(mean_kl_constraint, self.step_size),
        inputs=[obs_var, action_var, advantage_var]
               + old_dist_info_vars_list,
        constraint_name="mean_kl_center",
    )
    self.center_trpo_optimizer = optimizer
    # Reset Local Policies to Global Policy
    assignment_operations = []
    for policy in self.local_policies:
        for param_local, param_center in zip(
                policy.get_params_internal(),
                self.policy.get_params_internal()):
            if 'std' not in param_local.name:
                assignment_operations.append(
                    tf.assign(param_local, param_center))
    self.reset_to_center = tf.group(*assignment_operations)
    return dict()
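# --- Illustrative reference (not part of the original code) ---
# One plausible form (hypothetical; the actual Metrics.kl_on_others may
# differ) of the constrain-together penalty above: local policy n pays the
# average KL between itself and every other local policy, evaluated on
# shared observations, keeping the ensemble close enough to distill into
# the center policy via the behaviour-cloning loss.
import numpy as np

def kl_on_others_gaussian(n, means, log_stds):
    # means/log_stds: lists indexed by policy, each of shape (N, d)
    total, count = 0.0, 0
    for j in range(len(means)):
        if j == n:
            continue
        var_n, var_j = np.exp(2 * log_stds[n]), np.exp(2 * log_stds[j])
        kl = np.sum(log_stds[j] - log_stds[n]
                    + (var_n + (means[n] - means[j]) ** 2) / (2 * var_j)
                    - 0.5, axis=-1)
        total += kl.mean()
        count += 1
    return total / max(count, 1)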
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name='advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name="valid")
    else:
        valid_var = None
    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    # formulate as a minimization problem
    # The gradient of the surrogate objective is the policy gradient
    if is_recurrent:
        surr_obj = -tf.reduce_sum(logli * advantage_var * valid_var) \
            / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        max_kl = tf.reduce_max(kl * valid_var)
    else:
        surr_obj = -tf.reduce_mean(logli * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    vars_info = {
        "mean_kl": mean_kl,
        "input_list": input_list,
        "obs_var": obs_var,
        "action_var": action_var,
        "advantage_var": advantage_var,
        "surr_loss": surr_obj,
        "dist_info_vars": dist_info_vars,
        "lr": logli,
    }
    if self.qprop:
        eta_var = tensor_utils.new_tensor(
            'eta', ndim=1 + is_recurrent, dtype=tf.float32,
        )
        qvalue = self.qf.get_e_qval_sym(vars_info["obs_var"], self.policy,
                                        deterministic=True)
        qprop_surr_loss = -tf.reduce_mean(
            vars_info["lr"] * vars_info["advantage_var"]) \
            - tf.reduce_mean(qvalue * eta_var)
        input_list += [eta_var]
        self.optimizer.update_opt(
            loss=qprop_surr_loss,
            target=self.policy,
            inputs=input_list,
        )
        control_variate = self.qf.get_cv_sym(obs_var, action_var,
                                             self.policy)
        f_control_variate = tensor_utils.compile_function(
            inputs=[obs_var, action_var],
            outputs=control_variate,
        )
        self.opt_info_qprop = dict(f_control_variate=f_control_variate)
    else:
        self.optimizer.update_opt(loss=surr_obj, target=self.policy,
                                  inputs=input_list)
    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(
        f_kl=f_kl,
        target_policy=self.policy,
    )
    self.init_opt_critic()
def init_opt(self, name=''):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        name + 'obs', extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        name + 'action', extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name + 'advantage', ndim=1 + is_recurrent, dtype=tf.float32,
    )
    dist = self.policy.distribution
    old_dist_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=name + 'old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k]
                               for k in dist.dist_info_keys]
    state_info_vars = {
        k: tf.placeholder(tf.float32,
                          shape=[None] * (1 + is_recurrent) + list(shape),
                          name=name + k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k]
                            for k in self.policy.state_info_keys]
    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None],
                                   name=name + "valid")
    else:
        valid_var = None
    input_list = [obs_var, action_var, advantage_var] \
        + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)
    if self.kl_sample_backups > 0:
        kl_obs_var = self.env.observation_space.new_tensor_variable(
            name + 'kl_obs', extra_dims=1 + is_recurrent,
        )
        kl_old_dist_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=name + 'kl_old_%s' % k)
            for k, shape in dist.dist_info_specs
        }
        kl_old_dist_info_vars_list = [kl_old_dist_info_vars[k]
                                      for k in dist.dist_info_keys]
        kl_state_info_vars = {
            k: tf.placeholder(tf.float32,
                              shape=[None] * (1 + is_recurrent) + list(shape),
                              name=name + 'kl_%s' % k)
            for k, shape in self.policy.state_info_specs
        }
        kl_state_info_vars_list = [kl_state_info_vars[k]
                                   for k in self.policy.state_info_keys]
        kl_dist_info_vars = self.policy.dist_info_sym(kl_obs_var,
                                                      kl_state_info_vars)
        kl = dist.kl_sym(kl_old_dist_info_vars, kl_dist_info_vars)
        input_list += [kl_obs_var] + kl_state_info_vars_list \
            + kl_old_dist_info_vars_list
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    else:
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                   dist_info_vars)
    if self.qprop:
        if is_recurrent:
            raise NotImplementedError
        eta_var = tensor_utils.new_tensor(
            'eta', ndim=1 + is_recurrent, dtype=tf.float32,
        )
        surr_loss = -tf.reduce_mean(lr * advantage_var)
        if self.qprop_nu > 0:
            surr_loss *= 1 - self.qprop_nu
        if self.sample_backups > 0 or not self.policy_sample_last:
            off_obs_var = self.env.observation_space.new_tensor_variable(
                name + 'off_obs', extra_dims=1 + is_recurrent,
            )
            off_e_qval = self.qf.get_e_qval_sym(off_obs_var, self.policy,
                                                deterministic=True)
            input_list += [off_obs_var]
            surr_loss -= tf.reduce_mean(off_e_qval)  # * eta_var)
        else:
            e_qval = self.qf.get_e_qval_sym(obs_var, self.policy,
                                            deterministic=True)
            surr_loss -= tf.reduce_mean(e_qval * eta_var)
        mean_kl = tf.reduce_mean(kl)
        input_list += [eta_var]
        control_variate = self.qf.get_cv_sym(obs_var, action_var,
                                             self.policy)
        f_control_variate = tensor_utils.compile_function(
            inputs=[obs_var, action_var],
            outputs=control_variate,
        )
        self.opt_info_qprop = dict(f_control_variate=f_control_variate)
    elif self.phi:
        # Using stein control functional variate reduction
        if is_recurrent:
            raise NotImplementedError
        eta_var = tensor_utils.new_tensor(
            'eta', ndim=1 + is_recurrent, dtype=tf.float32,
        )
        if isinstance(self.pf, ContinuousLinearPhiFunction):
            phival = self.pf.get_e_phival_sym(obs_var, self.policy,
                                              gradwrtmu=True,
                                              deterministic=True)
            surr_loss = -tf.reduce_mean(lr * advantage_var) - \
                tf.reduce_mean(phival * eta_var)
            stein_phi = self.pf.get_phi_bar_sym(obs_var, action_var,
                                                self.policy)
        elif isinstance(self.pf, ContinuousQuadraticPhiFunction):
            dist_info = self.policy.dist_info_sym(obs_var)
            mean = dist_info["mean"]
            log_std = dist_info["log_std"]
            phi_derives = self.pf.get_phi_derive_sym(obs_var, action_var)
            surr_loss = -tf.reduce_mean(lr * advantage_var)
            mu_loss = -tf.reduce_sum(
                tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) *
                                 phi_derives['phi_prime']) * mean,
                axis=1)
            var_loss = -tf.reduce_sum(
                tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) *
                                 phi_derives['phi_double_prime']) *
                tf.exp(2. * log_std),
                axis=1)
            surr_loss = surr_loss + tf.reduce_mean(mu_loss) + \
                tf.reduce_mean(var_loss)
            stein_phi = self.pf.get_phival_sym(obs_var, action_var)
        elif isinstance(self.pf, ContinuousMLPPhiFunction):
            dist_info = self.policy.dist_info_sym(obs_var)
            mean = dist_info['mean']
            log_std = dist_info['log_std']
            grad_info, _ = self.policy.get_grad_info_sym(obs_var,
                                                         action_var)
            phi_derives = self.pf.get_phi_derive_sym(obs_var, action_var)
            surr_loss = -tf.reduce_mean(lr * advantage_var)
            mu_loss = -tf.reduce_sum(
                tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) *
                                 phi_derives['phi_prime']) * mean,
                axis=1)
            var_loss = -(-tf.reduce_sum(
                tf.stop_gradient(tf.expand_dims(lr * eta_var, axis=1) *
                                 .5 * grad_info['logpi_dmu'] *
                                 phi_derives['phi_prime']) *
                tf.exp(2. * log_std),
                axis=1))
            surr_loss = surr_loss + tf.reduce_mean(mu_loss) + \
                tf.reduce_mean(var_loss)
            stein_phi = self.pf.get_phival_sym(obs_var, action_var)
        else:
            raise NotImplementedError
        mean_kl = tf.reduce_mean(kl)
        input_list += [eta_var]
        f_stein_phi = tensor_utils.compile_function(
            inputs=[obs_var, action_var],
            outputs=stein_phi,
        )
        self.opt_info_phi = dict(f_stein_phi=f_stein_phi)
    elif not self.qprop and not self.phi:
        if is_recurrent:
            mean_kl = tf.reduce_sum(kl * valid_var) \
                / tf.reduce_sum(valid_var)
            surr_loss = -tf.reduce_sum(lr * advantage_var * valid_var) \
                / tf.reduce_sum(valid_var)
        else:
            mean_kl = tf.reduce_mean(kl)
            surr_loss = -tf.reduce_mean(lr * advantage_var)
    if self.ac_delta > 0:
        ac_obs_var = self.env.observation_space.new_tensor_variable(
            name + 'ac_obs', extra_dims=1 + is_recurrent,
        )
        e_qval = self.qf.get_e_qval_sym(ac_obs_var, self.policy,
                                        deterministic=True)
        input_list += [ac_obs_var]
        surr_loss *= (1.0 - self.ac_delta)
        surr_loss -= self.ac_delta * tf.reduce_mean(e_qval)
    self.optimizer.update_opt(
        loss=surr_loss,
        target=self.policy,
        leq_constraint=(mean_kl, self.step_size),
        inputs=input_list,
        constraint_name="mean_kl"
    )
    self.opt_info = dict(target_policy=self.policy)
    self.init_opt_critic()
    self.init_opt_phi()
    return dict()
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=tf.float32,
    )
    dist = self.policy.distribution

    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape,
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    state_info_vars = {
        k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape, name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

    kl_penalty_var = tf.Variable(
        initial_value=self.initial_kl_penalty,
        dtype=tf.float32,
        name="kl_penalty"
    )

    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=(None, None), name="valid")

        if hasattr(self.policy, "prob_network"):
            rnn_network = self.policy.prob_network
            state_dim = rnn_network.state_dim
            recurrent_layer = rnn_network.recurrent_layer
            state_init_param = rnn_network.state_init_param
        elif hasattr(self.policy, "head_network"):
            rnn_network = self.policy.head_network
            state_dim = rnn_network.state_dim
            recurrent_layer = rnn_network.recurrent_layer
            state_init_param = rnn_network.state_init_param
        else:
            state_dim = self.policy.l_rnn.state_dim
            recurrent_layer = self.policy.l_rnn
            state_init_param = tf.reshape(
                self.policy.l_rnn.cell.zero_state(1, dtype=tf.float32), (-1,))

        state_var = tf.placeholder(tf.float32, (None, state_dim), "state")

        recurrent_state_output = dict()

        minibatch_dist_info_vars = self.policy.dist_info_sym(
            obs_var, state_info_vars,
            recurrent_state={recurrent_layer: state_var},
            recurrent_state_output=recurrent_state_output,
        )

        state_output = recurrent_state_output[recurrent_layer]
        if hasattr(self.policy, "prob_network") or hasattr(self.policy, "head_network"):
            final_state = tf.reverse(state_output, [1])[:, 0, :]
        else:
            final_state = state_output

        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, minibatch_dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, minibatch_dist_info_vars)
        ent = tf.reduce_sum(dist.entropy_sym(minibatch_dist_info_vars) * valid_var) / \
            tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        clipped_lr = tf.clip_by_value(lr, 1. - self.clip_lr, 1. + self.clip_lr)
        surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        clipped_surr_loss = - tf.reduce_sum(
            tf.minimum(lr * advantage_var, clipped_lr * advantage_var) * valid_var
        ) / tf.reduce_sum(valid_var)

        clipped_surr_pen_loss = clipped_surr_loss - self.entropy_bonus_coeff * ent
        if self.use_kl_penalty:
            clipped_surr_pen_loss += kl_penalty_var * tf.maximum(0., mean_kl - self.step_size)

        self.optimizer.update_opt(
            loss=clipped_surr_pen_loss,
            target=self.policy,
            inputs=[obs_var, action_var, advantage_var] + state_info_vars_list +
                   old_dist_info_vars_list + [valid_var],
            rnn_init_state=state_init_param,
            rnn_state_input=state_var,
            rnn_final_state=final_state,
            diagnostic_vars=OrderedDict([
                ("UnclippedSurrLoss", surr_loss),
                ("MeanKL", mean_kl),
            ])
        )
    else:
        dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        ent = tf.reduce_mean(dist.entropy_sym(dist_info_vars))
        mean_kl = tf.reduce_mean(kl)
        clipped_lr = tf.clip_by_value(lr, 1. - self.clip_lr, 1. + self.clip_lr)
        surr_loss = - tf.reduce_mean(lr * advantage_var)
        clipped_surr_loss = - tf.reduce_mean(
            tf.minimum(lr * advantage_var, clipped_lr * advantage_var)
        )
        clipped_surr_pen_loss = clipped_surr_loss - self.entropy_bonus_coeff * ent
        if self.use_kl_penalty:
            clipped_surr_pen_loss += kl_penalty_var * tf.maximum(0., mean_kl - self.step_size)
        self.optimizer.update_opt(
            loss=clipped_surr_pen_loss,
            target=self.policy,
            inputs=[obs_var, action_var, advantage_var] + state_info_vars_list +
                   old_dist_info_vars_list,
            diagnostic_vars=OrderedDict([
                ("UnclippedSurrLoss", surr_loss),
                ("MeanKL", mean_kl),
            ])
        )

    self.kl_penalty_var = kl_penalty_var
    self.f_increase_penalty = tensor_utils.compile_function(
        inputs=[],
        outputs=tf.assign(
            kl_penalty_var,
            tf.minimum(kl_penalty_var * self.increase_penalty_factor, self.max_penalty)
        )
    )
    self.f_decrease_penalty = tensor_utils.compile_function(
        inputs=[],
        outputs=tf.assign(
            kl_penalty_var,
            tf.maximum(kl_penalty_var * self.decrease_penalty_factor, self.min_penalty)
        )
    )
    self.f_reset_penalty = tensor_utils.compile_function(
        inputs=[],
        outputs=tf.assign(
            kl_penalty_var,
            self.initial_kl_penalty
        )
    )
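The `f_increase_penalty` / `f_decrease_penalty` / `f_reset_penalty` functions above implement an adaptive KL-penalty schedule as TF assign ops. A plain-Python sketch of the same schedule, with illustrative constants that are not taken from the class:

class KLPenaltySchedule:
    def __init__(self, initial=1.0, increase_factor=1.5, decrease_factor=1. / 1.5,
                 min_penalty=1e-3, max_penalty=1e4):
        self.initial = initial
        self.penalty = initial
        self.increase_factor = increase_factor
        self.decrease_factor = decrease_factor
        self.min_penalty = min_penalty
        self.max_penalty = max_penalty

    def update(self, mean_kl, step_size):
        # Grow the penalty while the KL constraint is violated, shrink it otherwise.
        if mean_kl > step_size:
            self.penalty = min(self.penalty * self.increase_factor, self.max_penalty)
        else:
            self.penalty = max(self.penalty * self.decrease_factor, self.min_penalty)
        return self.penalty

    def reset(self):
        self.penalty = self.initial

schedule = KLPenaltySchedule()
for kl in [0.05, 0.02, 0.005, 0.001]:   # measured mean KLs across epochs
    print(schedule.update(kl, step_size=0.01))

The multiplicative update keeps the coefficient within fixed bounds, so a few violating epochs raise the pressure quickly without letting it diverge.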
def init_experts_opt(self):
    ###############################
    #
    # Variable Definitions
    #
    ###############################
    all_task_dist_info_vars = []
    all_obs_vars = []
    for i, policy in enumerate(self.local_policies):
        task_obs_var = self.env_partitions[i].observation_space.new_tensor_variable(
            'obs%d' % i, extra_dims=1)
        task_dist_info_vars = []
        for j, other_policy in enumerate(self.local_policies):
            state_info_vars = dict()  # Not handling recurrent policies
            dist_info_vars = other_policy.dist_info_sym(task_obs_var, state_info_vars)
            task_dist_info_vars.append(dist_info_vars)
        all_obs_vars.append(task_obs_var)
        all_task_dist_info_vars.append(task_dist_info_vars)

    obs_var = self.env.observation_space.new_tensor_variable('obs', extra_dims=1)
    action_var = self.env.action_space.new_tensor_variable('action', extra_dims=1)
    advantage_var = tensor_utils.new_tensor('advantage', ndim=1, dtype=tf.float32)

    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] + list(shape), name='old_%s' % k)
        for k, shape in self.policy.distribution.dist_info_specs
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in self.policy.distribution.dist_info_keys
    ]

    central_obs_vars = [elem[1] for elem in self.central_policy_dist_infos]

    input_list = [
        obs_var, action_var, advantage_var
    ] + old_dist_info_vars_list + all_obs_vars + central_obs_vars

    ###############################
    #
    # Local Policy Optimization
    #
    ###############################
    self.optimizers = []
    self.metrics = []
    for n, policy in enumerate(self.local_policies):
        state_info_vars = dict()
        dist_info_vars = policy.dist_info_sym(obs_var, state_info_vars)
        dist = policy.distribution
        kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
        lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
        surr_loss = -tf.reduce_mean(lr * advantage_var)

        if self.constrain_together:
            additional_loss = Metrics.kl_on_others(n, dist, all_task_dist_info_vars)
        elif self.constrain_against_central:
            additional_loss = Metrics.kl_on_central(
                dist, dist_info_vars, self.central_policy_dist_infos[n][0])
        else:
            additional_loss = tf.constant(0.0)

        local_loss = surr_loss + self.penalty * additional_loss

        kl_metric = tensor_utils.compile_function(inputs=input_list,
                                                  outputs=additional_loss,
                                                  log_name="KLPenalty%d" % n)
        self.metrics.append(kl_metric)

        mean_kl_constraint = tf.reduce_mean(kl)

        optimizer = PenaltyLbfgsOptimizer(name='expertOptimizer_' + str(n))
        optimizer.update_opt(
            loss=local_loss,
            target=policy,
            leq_constraint=(mean_kl_constraint, self.step_size),
            inputs=input_list,
            constraint_name="mean_kl_%d" % n,
        )
        self.optimizers.append(optimizer)

    return dict()
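The internals of `Metrics.kl_on_others` and `Metrics.kl_on_central` are not shown here, but for diagonal-Gaussian policies the penalty they would contribute is a mean KL divergence between two policies evaluated on the same observations. A hedged NumPy sketch of that quantity (all names and shapes below are illustrative):

import numpy as np

def kl_diag_gaussians(mu_p, log_std_p, mu_q, log_std_q):
    """KL(p || q) per sample for diagonal Gaussians, summed over action dims."""
    var_p = np.exp(2. * log_std_p)
    var_q = np.exp(2. * log_std_q)
    kl = log_std_q - log_std_p + (var_p + (mu_p - mu_q) ** 2) / (2. * var_q) - 0.5
    return kl.sum(axis=-1)

rng = np.random.default_rng(1)
mu_p, mu_q = rng.normal(size=(128, 4)), rng.normal(size=(128, 4))
log_std_p = np.full((128, 4), -1.0)
log_std_q = np.full((128, 4), -0.5)

# The kind of scalar that would enter local_loss as `additional_loss`.
additional_loss = kl_diag_gaussians(mu_p, log_std_p, mu_q, log_std_q).mean()
print(additional_loss)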
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        name='advantage',
        ndim=1 + is_recurrent,
        dtype=tf.float32,
    )
    dist = self.policy.distribution

    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    state_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    logli = dist.log_likelihood_sym(action_var, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    # Formulate as a minimization problem.
    # The gradient of the surrogate objective is the policy gradient.
    if is_recurrent:
        surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        max_kl = tf.reduce_max(kl * valid_var)
    else:
        surr_obj = - tf.reduce_mean(logli * advantage_var)
        mean_kl = tf.reduce_mean(kl)
        max_kl = tf.reduce_max(kl)

    input_list = [obs_var, action_var, advantage_var] + state_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    self.optimizer.update_opt(loss=surr_obj, target=self.policy, inputs=input_list)

    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(
        f_kl=f_kl,
    )
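The comment in the function above says the gradient of the surrogate is the policy gradient; a small standalone NumPy check of that claim for a 1-D Gaussian policy (toy advantages, nothing below comes from the class):

import numpy as np

rng = np.random.default_rng(2)
mu, sigma = 0.0, 1.0
actions = rng.normal(mu, sigma, size=10_000)
adv = actions ** 2 - 1.0                      # toy advantages

# Analytic gradient of -mean(log pi(a) * A) in mu:
# d log pi / d mu = (a - mu) / sigma^2, so the REINFORCE estimator appears.
grad_analytic = -np.mean(adv * (actions - mu) / sigma ** 2)

# Finite-difference check of the same surrogate in mu.
def surr(m):
    logli = -(actions - m) ** 2 / (2 * sigma ** 2) \
            - 0.5 * np.log(2 * np.pi * sigma ** 2)
    return -np.mean(logli * adv)

eps = 1e-5
grad_fd = (surr(mu + eps) - surr(mu - eps)) / (2 * eps)
print(grad_analytic, grad_fd)   # agree up to finite-difference error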
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=tf.float32,
    )
    dist = self.policy.distribution

    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape,
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]

    state_info_vars = {
        k: tf.placeholder(tf.float32, shape=(None,) * (1 + is_recurrent) + shape, name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]

    kl_penalty_var = tf.Variable(
        initial_value=self.initial_kl_penalty,
        dtype=tf.float32,
        name="kl_penalty"
    )

    # TODO: The code below only works for FF policy.
    assert is_recurrent == 0

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    ent = tf.reduce_mean(dist.entropy_sym(dist_info_vars))
    mean_kl = tf.reduce_mean(kl)
    clipped_lr = tf.clip_by_value(lr, 1. - self.clip_lr, 1. + self.clip_lr)
    surr_loss = - tf.reduce_mean(lr * advantage_var)
    clipped_surr_loss = - tf.reduce_mean(
        tf.minimum(lr * advantage_var, clipped_lr * advantage_var)
    )
    clipped_surr_pen_loss = clipped_surr_loss - self.entropy_bonus_coeff * ent
    if self.use_kl_penalty:
        clipped_surr_pen_loss += kl_penalty_var * tf.maximum(0., mean_kl - self.step_size)

    self.optimizer.update_opt(
        loss=clipped_surr_pen_loss,
        target=self.policy,
        inputs=[obs_var, action_var, advantage_var] + state_info_vars_list +
               old_dist_info_vars_list,
        diagnostic_vars=OrderedDict([
            ("UnclippedSurrLoss", surr_loss),
            ("MeanKL", mean_kl),
        ])
    )

    self.kl_penalty_var = kl_penalty_var
    self.f_increase_penalty = tensor_utils.compile_function(
        inputs=[],
        outputs=tf.assign(
            kl_penalty_var,
            tf.minimum(kl_penalty_var * self.increase_penalty_factor, self.max_penalty)
        )
    )
    self.f_decrease_penalty = tensor_utils.compile_function(
        inputs=[],
        outputs=tf.assign(
            kl_penalty_var,
            tf.maximum(kl_penalty_var * self.decrease_penalty_factor, self.min_penalty)
        )
    )
    self.f_reset_penalty = tensor_utils.compile_function(
        inputs=[],
        outputs=tf.assign(
            kl_penalty_var,
            self.initial_kl_penalty
        )
    )
    return dict()
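A worked NumPy example of the clipped surrogate assembled above: taking the elementwise minimum of the unclipped and clipped terms removes any incentive to push the likelihood ratio outside [1 - clip_lr, 1 + clip_lr] (the numbers below are arbitrary):

import numpy as np

clip_lr = 0.2
lr = np.array([0.5, 0.9, 1.0, 1.5, 2.0])      # likelihood ratios pi_new / pi_old
adv = np.array([1.0, -1.0, 1.0, 1.0, -1.0])   # advantages

clipped = np.clip(lr, 1. - clip_lr, 1. + clip_lr)
surr_loss = -np.mean(lr * adv)                                   # unclipped
clipped_surr_loss = -np.mean(np.minimum(lr * adv, clipped * adv))
print(surr_loss, clipped_surr_loss)
# For lr = 1.5 with adv = 1.0 the clipped term contributes only 1.2, so the
# gradient through that sample vanishes once lr exceeds 1 + clip_lr; for
# lr = 2.0 with adv = -1.0 the minimum keeps the pessimistic unclipped value.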
def init_opt(self):
    is_recurrent = int(self.policy.recurrent)
    obs_var = self.env.observation_space.new_tensor_variable(
        'obs',
        extra_dims=1 + is_recurrent,
    )
    action_var = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1 + is_recurrent,
    )
    advantage_var = tensor_utils.new_tensor(
        'advantage',
        ndim=1 + is_recurrent,
        dtype=tf.float32,
    )
    dist = self.policy.distribution

    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape),
                          name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    state_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    if is_recurrent:
        valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
    else:
        valid_var = None

    dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
    lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
    # entropy_bonus = sum(list(entropy_list[j][i] for j in range(self.num_grad_updates)))
    entropy = dist.entropy_sym(dist_info_vars)

    clipped_obj = tf.minimum(
        lr * advantage_var,
        tf.clip_by_value(lr, 1 - self.clip_eps, 1 + self.clip_eps) * advantage_var)

    if is_recurrent:
        mean_entropy = tf.reduce_sum(entropy) / tf.reduce_sum(valid_var)
        mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
        surr_loss = - tf.reduce_sum(clipped_obj * valid_var) / tf.reduce_sum(valid_var) \
            + self.kl_coeff * mean_kl - self.entropy_coeff * mean_entropy
    else:
        mean_entropy = tf.reduce_mean(entropy)
        mean_kl = tf.reduce_mean(kl)
        surr_loss = - tf.reduce_mean(clipped_obj) \
            + self.kl_coeff * mean_kl - self.entropy_coeff * mean_entropy

    input_list = [
        obs_var,
        action_var,
        advantage_var,
    ] + state_info_vars_list + old_dist_info_vars_list
    if is_recurrent:
        input_list.append(valid_var)

    extra_inputs = [tf.placeholder(tf.float32, shape=[], name='kl_coeff')]

    self.optimizer.update_opt(loss=surr_loss, target=self.policy, kl=mean_kl,
                              inputs=input_list, extra_inputs=extra_inputs)
    return dict()
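In the recurrent branch above, `valid_var` marks real timesteps with 1 and padding with 0, and every mean is taken as a masked sum divided by the mask total. A short NumPy illustration of why that matters (toy numbers):

import numpy as np

kl = np.array([[0.01, 0.02, 0.03],
               [0.04, 0.00, 0.00]])          # second path has length 1
valid = np.array([[1., 1., 1.],
                  [1., 0., 0.]])

naive_mean_kl = kl.mean()                     # averages over padded zeros too
mean_kl = (kl * valid).sum() / valid.sum()    # matches the TF expression above
print(naive_mean_kl, mean_kl)                 # 0.0167 vs. 0.025

Without the mask, padded timesteps dilute every statistic toward zero, biasing both the loss and the KL diagnostics.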
def init_opt(self):
    observations = self.env.observation_space.new_tensor_variable(
        'observations',
        extra_dims=1,
    )
    actions = self.env.action_space.new_tensor_variable(
        'action',
        extra_dims=1,
    )
    advantage = tensor_utils.new_tensor(
        name='advantage',
        ndim=1,
        dtype=tf.float32,
    )
    dist = self.policy.distribution

    self.loss = tf.placeholder(tf.float32, name='actor_loss')
    self.entropy_loss = tf.placeholder(tf.float32, name='entropy_loss')
    self.avg_rewards = tf.placeholder(tf.float32, name='avg_rewards')
    self.total_rewards = tf.placeholder(tf.float32, name='total_rewards')

    old_dist_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] + list(shape), name='old_%s' % k)
        for k, shape in dist.dist_info_specs
    }
    old_dist_info_vars_list = [
        old_dist_info_vars[k] for k in dist.dist_info_keys
    ]

    state_info_vars = {
        k: tf.placeholder(tf.float32, shape=[None] + list(shape), name=k)
        for k, shape in self.policy.state_info_specs
    }
    state_info_vars_list = [
        state_info_vars[k] for k in self.policy.state_info_keys
    ]

    dist_info_vars = self.policy.dist_info_sym(observations, state_info_vars)
    logli = dist.log_likelihood_sym(actions, dist_info_vars)
    kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)

    loss = -tf.reduce_mean(logli * advantage)
    mean_kl = tf.reduce_mean(kl)
    max_kl = tf.reduce_max(kl)

    input_list = [observations, actions, advantage] + state_info_vars_list

    self.optimizer.update_opt(loss=loss, target=self.policy, inputs=input_list)

    f_kl = tensor_utils.compile_function(
        inputs=input_list + old_dist_info_vars_list,
        outputs=[mean_kl, max_kl],
    )
    self.opt_info = dict(f_kl=f_kl)

    # tf.summary.* replaces the pre-1.0 API (tf.train.SummaryWriter,
    # tf.merge_summary, tf.scalar_summary), which was removed in TF 1.0;
    # summary names use underscores because spaces are invalid in node names.
    self.writer = tf.summary.FileWriter("summary/")
    self.write_op = tf.summary.merge([
        tf.summary.scalar("Loss", self.loss),
        tf.summary.scalar("Entropy_Loss", self.entropy_loss),
        tf.summary.scalar("Total_Rewards", self.total_rewards),
        tf.summary.scalar("Avg_Rewards", self.avg_rewards)
    ])
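The merged summary op above is built from scalar placeholders, so it has to be evaluated with a feed dict and written out each iteration. A standalone TF1-style sketch of that usage (the placeholder name, log directory, and loss values below are illustrative, not from the class):

import tensorflow as tf

loss_ph = tf.placeholder(tf.float32, name='actor_loss')
summary_op = tf.summary.merge([tf.summary.scalar("Loss", loss_ph)])
writer = tf.summary.FileWriter("summary/")

with tf.Session() as sess:
    for step, loss_value in enumerate([1.2, 0.9, 0.7]):
        # Evaluate the merged summary with the current scalar and log it.
        s = sess.run(summary_op, feed_dict={loss_ph: loss_value})
        writer.add_summary(s, global_step=step)
    writer.flush()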