def test_mpi_adam(): """ tests the MpiAdam object's functionality """ np.random.seed(0) tf.set_random_seed(0) a_var = tf.Variable(np.random.randn(3).astype('float32')) b_var = tf.Variable(np.random.randn(2, 5).astype('float32')) loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var)) learning_rate = 1e-2 update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) do_update = tf_utils.function([], loss, updates=[update_op]) tf.get_default_session().run(tf.global_variables_initializer()) for step in range(10): print(step, do_update()) tf.set_random_seed(0) tf.get_default_session().run(tf.global_variables_initializer()) var_list = [a_var, b_var] lossandgrad = tf_utils.function( [], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op]) adam = MpiAdam(var_list) for step in range(10): loss, grad = lossandgrad() adam.update(grad, learning_rate) print(step, loss)
def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"): """ reward regression from observations and transitions :param env: (Gym Environment) :param hidden_size: ([int]) the hidden dimension for the MLP :param entcoeff: (float) the entropy loss weight :param scope: (str) tensorflow variable scope """ self.scope = scope self.observation_shape = env.observation_space.shape self.actions_shape = env.action_space.shape self.input_shape = tuple([ o + a for o, a in zip(self.observation_shape, self.actions_shape) ]) self.num_actions = env.action_space.shape[0] self.hidden_size = hidden_size self.build_ph() # Build grpah generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False) expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True) # Build accuracy generator_acc = tf.reduce_mean( tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5)) expert_acc = tf.reduce_mean( tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5)) # Build regression loss # let x = logits, z = targets. # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) generator_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=generator_logits, labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [ generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc ] self.loss_name = [ "generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc" ] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() self.lossandgrad = tf_util.function([ self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
def _setup_critic_optimizer(self): """ setup the optimizer for the critic """ if self.verbose >= 2: logger.info('setting up critic optimizer') normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) if self.critic_l2_reg > 0.: critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/') if 'bias' not in var.name and 'output' not in var.name and 'b' not in var.name] if self.verbose >= 2: for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars ) self.critic_loss += critic_reg critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) if self.verbose >= 2: logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'), clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999, epsilon=1e-08)
def _setup_critic_optimizer(self): """ setup the optimizer for the critic """ if self.verbose >= 2: logger.info('setting up critic optimizer') ### BSS LOSS ### all_vars = [v for v in tf.global_variables()] self.l2_loss = 0.0 for var in all_vars: if 'qf' in var.name: self.l2_loss += tf.losses.mean_squared_error( tf.zeros(var.shape), var) _, qf_features = self.policy_tf.feature_matrices() singular_qf = tf.linalg.svd(qf_features, compute_uv=False) self.bss_loss = tf.reduce_sum(tf.square(singular_qf[-1])) ### BSS LOSS ### normalized_critic_target_tf = tf.clip_by_value( normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1]) self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) + \ self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss if self.critic_l2_reg > 0.: critic_reg_vars = [ var for var in tf_util.get_trainable_vars('model/qf/') if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name ] if self.verbose >= 2: for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format( self.critic_l2_reg)) critic_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.critic_l2_reg), weights_list=critic_reg_vars) self.critic_loss += critic_reg critic_shapes = [ var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/') ] critic_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) if self.verbose >= 2: logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) self.critic_grads = tf_util.flatgrad( self.critic_loss, tf_util.get_trainable_vars('model/qf/'), clip_norm=self.clip_norm) self.critic_optimizer = MpiAdam( var_list=tf_util.get_trainable_vars('model/qf/'), beta1=0.9, beta2=0.999, epsilon=1e-08)
def _setup_actor_optimizer(self): """ setup the optimizer for the actor """ if self.verbose >= 2: logger.info('setting up actor optimizer') self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) if self.verbose >= 2: logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'), clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999, epsilon=1e-08)
def _setup_actor_optimizer(self): """ setup the optimizer for the actor """ if self.verbose >= 2: logger.info('setting up actor optimizer') ### BSS LOSS ### all_vars = [v for v in tf.global_variables()] self.l2_loss = 0.0 for var in all_vars: if 'pi' in var.name: self.l2_loss += tf.losses.mean_squared_error( tf.zeros(var.shape), var) pi_features, _ = self.policy_tf.feature_matrices() singular_pi = tf.linalg.svd(pi_features, compute_uv=False) self.bss_loss = tf.reduce_sum(tf.square(singular_pi[-1])) ### BSS LOSS ### self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) + \ self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss actor_shapes = [ var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/') ] actor_nb_params = sum( [reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) if self.verbose >= 2: logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) self.actor_grads = tf_util.flatgrad( self.actor_loss, tf_util.get_trainable_vars('model/pi/'), clip_norm=self.clip_norm) self.actor_optimizer = MpiAdam( var_list=tf_util.get_trainable_vars('model/pi/'), beta1=0.9, beta2=0.999, epsilon=1e-08)
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=2, app='', saves=False, wsaves=False, epoch=-1, seed=1, dc=0): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) # env._seed(seed) gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app dirname = '{}_{}opts_saves/'.format(gamename, num_options) if wsaves: first = True if not os.path.exists(dirname): os.makedirs(dirname) first = False # while os.path.exists(dirname) and first: # dirname += '0' files = ['pposgd_simple.py', 'mlp_policy.py', 'run_main.py'] for i in range(len(files)): src = os.path.expanduser('~/baselines/baselines/ppo1/') + files[i] dest = os.path.expanduser('~/baselines/baselines/ppo1/') + dirname shutil.copy2(src, dest) # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # option = tf.placeholder(dtype=tf.int32, shape=[None]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # pdb.set_trace() ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0)) entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1) op_loss = -tf.reduce_sum(log_pi[0][option[0]] * atarg + entropy * 0.1) total_loss += op_loss var_list = pi.get_trainable_variables() term_list = var_list[6:8] lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option, term_adv], losses + [U.flatgrad(total_loss, var_list)]) termloss = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list) ]) # Since we will use a different step size. adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() saver = tf.train.Saver(max_to_keep=10000) results = [] if saves: results = open( gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w') out = 'epoch,avg_reward' for opt in range(num_options): out += ',option {} dur'.format(opt) for opt in range(num_options): out += ',option {} std'.format(opt) for opt in range(num_options): out += ',option {} term'.format(opt) for opt in range(num_options): out += ',option {} adv'.format(opt) out += '\n' results.write(out) # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n') results.flush() if epoch >= 0: dirname = '{}_{}opts_saves/'.format(gamename, num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch) saver.restore(U.get_session(), filename) episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options, saves=saves, results=results, rewbuffer=rewbuffer, dc=dc) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) opt_d = [] for i in range(num_options): dur = np.mean( seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) std = [] for i in range(num_options): logstd = np.mean( seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0. std.append(np.exp(logstd)) print("mean opt dur:", opt_d) print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']), axis=0)) print("mean value val:", np.mean(np.array(seg['value_val']), axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values if iters_so_far % 5 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format( gamename, iters_so_far) save_path = saver.save(U.get_session(), filename) min_batch = 160 t_advs = [[] for _ in range(num_options)] for opt in range(num_options): indices = np.where(opts == opt)[0] print("batch size:", indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue # This part is only necessasry when we use options. # We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif indices.size + datas[opt].n < min_batch: # pdb.set_trace() oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch): oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) if (indices.size > min_batch and datas[opt].n > min_batch): datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) elif datas[opt] == 0: datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] optim_epochs = np.clip( np.int(10 * (indices.size / (timesteps_per_batch / num_options))), 10, 10) if num_options > 1 else optim_epochs print("optim epochs:", optim_epochs) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt]) tadv = tadv if num_options > 1 else np.zeros_like(tadv) t_advs[opt].append(nodc_adv) *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) termg = termloss(batch["ob"], [opt], tadv) adam.update(termg[0], 5e-7 * cur_lrmult) adam.update(grads, optim_stepsize * cur_lrmult) losses.append(newlosses) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() if saves: out = "{},{}" for _ in range(num_options): out += ",{},{},{},{}" out += "\n" info = [iters_so_far, np.mean(rewbuffer)] for i in range(num_options): info.append(opt_d[i]) for i in range(num_options): info.append(std[i]) for i in range(num_options): info.append(np.mean(np.array(seg['term_p']), axis=0)[i]) for i in range(num_options): info.append(np.mean(t_advs[i])) results.write(out.format(*info)) results.flush()
def __init__(self, observation_space, action_space, hidden_size, entcoeff=0.001, scope="adversary", normalize=True): """ Reward regression from observations and transitions :param observation_space: (gym.spaces) :param action_space: (gym.spaces) :param hidden_size: ([int]) the hidden dimension for the MLP :param entcoeff: (float) the entropy loss weight :param scope: (str) tensorflow variable scope :param normalize: (bool) Whether to normalize the reward or not """ # TODO: support images properly (using a CNN) self.scope = scope self.observation_shape = observation_space.shape self.actions_shape = action_space.shape if isinstance(action_space, gym.spaces.Box): # Continuous action space self.discrete_actions = False self.n_actions = action_space.shape[0] elif isinstance(action_space, gym.spaces.Discrete): self.n_actions = action_space.n self.discrete_actions = True else: raise ValueError( 'Action space not supported: {}'.format(action_space)) self.hidden_size = hidden_size self.normalize = normalize self.obs_rms = None # Placeholders self.generator_obs_ph = tf.compat.v1.placeholder( observation_space.dtype, (None, ) + self.observation_shape, name="observations_ph") self.generator_acs_ph = tf.compat.v1.placeholder( action_space.dtype, (None, ) + self.actions_shape, name="actions_ph") self.expert_obs_ph = tf.compat.v1.placeholder( observation_space.dtype, (None, ) + self.observation_shape, name="expert_observations_ph") self.expert_acs_ph = tf.compat.v1.placeholder( action_space.dtype, (None, ) + self.actions_shape, name="expert_actions_ph") # Build graph generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False) expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True) # Build accuracy generator_acc = tf.reduce_mean(input_tensor=tf.cast( tf.nn.sigmoid(generator_logits) < 0.5, tf.float32)) expert_acc = tf.reduce_mean(input_tensor=tf.cast( tf.nn.sigmoid(expert_logits) > 0.5, tf.float32)) # Build regression loss # let x = logits, z = targets. # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) generator_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=generator_logits, labels=tf.zeros_like(generator_logits)) generator_loss = tf.reduce_mean(input_tensor=generator_loss) expert_loss = tf.nn.sigmoid_cross_entropy_with_logits( logits=expert_logits, labels=tf.ones_like(expert_logits)) expert_loss = tf.reduce_mean(input_tensor=expert_loss) # Build entropy loss logits = tf.concat([generator_logits, expert_logits], 0) entropy = tf.reduce_mean(input_tensor=logit_bernoulli_entropy(logits)) entropy_loss = -entcoeff * entropy # Loss + Accuracy terms self.losses = [ generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc ] self.loss_name = [ "generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc" ] self.total_loss = generator_loss + expert_loss + entropy_loss # Build Reward for policy self.reward_op = -tf.math.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) var_list = self.get_trainable_variables() self.lossandgrad = tf_util.function([ self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
def setup_model(self): # prevent import loops with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() print("number of workers are", self.nworkers) self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) self._setup_learn(self.seed) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for phi with tf.variable_scope("phi", reuse=False): self.policy_phi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for phi old with tf.variable_scope("oldphi", reuse=False): self.policy_phi_old = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_flat - ret)) vf_phi_err = tf.reduce_mean( tf.square(self.policy_phi.value_flat - ret)) vf_phi_old_err = tf.reduce_mean( tf.square(self.policy_phi_old.value_flat)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] all_var_oldpi_list = tf_util.get_trainable_vars("oldpi") var_oldpi_list = [ v for v in all_var_oldpi_list if "/vf" not in v.name and "/q/" not in v.name ] all_var_phi_list = tf_util.get_trainable_vars("phi") vf_phi_var_list = [ v for v in all_var_phi_list if "/pi" not in v.name and "/logstd" not in v.name and "/q" not in v.name ] all_var_phi_old_list = tf_util.get_trainable_vars("oldphi") vf_phi_old_var_list = [ v for v in all_var_phi_old_list if "/pi" not in v.name and "/logstd" not in v.name and "/q" not in v.name ] #print("vars", vf_var_list) self.policy_vars = all_var_list self.oldpolicy_vars = all_var_oldpi_list print("all var list", all_var_list) print("phi vars", vf_phi_var_list) print("phi old vars", vf_phi_old_var_list) self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) self.compute_vf_phi_lossandgrad = tf_util.function( [observation, self.policy_phi.obs_ph, ret], tf_util.flatgrad(vf_phi_err, vf_phi_var_list)) self.compute_vf_loss = tf_util.function( [observation, old_policy.obs_ph, ret], vferr) self.compute_vf_phi_loss = tf_util.function( [observation, self.policy_phi.obs_ph, ret], vf_phi_err) #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err) #self.phi_old_obs = np.array([-0.012815 , -0.02076313, 0.07524705, 0.09407324, 0.0901745 , -0.09339058, 0.03544853, -0.03297224]) #self.phi_old_obs = self.phi_old_obs.reshape((1, 8)) update_phi_old_expr = [] for var, var_target in zip( sorted(vf_phi_var_list, key=lambda v: v.name), sorted(vf_phi_old_var_list, key=lambda v: v.name)): update_phi_old_expr.append(var_target.assign(var)) update_phi_old_expr = tf.group(*update_phi_old_expr) self.update_phi_old = tf_util.function( [], [], updates=[update_phi_old_expr]) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield @contextmanager def temp_seed(seed): state = np.random.get_state() np.random.seed(seed) try: yield finally: np.random.set_state(state) def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess) self.vfadam.sync() self.vf_phi_adam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) self.timed = timed self.allmean = allmean self.temp_seed = temp_seed self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars( "model") + tf_util.get_trainable_vars("oldpi") self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Penalty related variable with tf.variable_scope('penalty'): cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8)) penalty_param = tf.get_variable('penalty_param', initializer=float(param_init), trainable=True, dtype=tf.float32) penalty = tf.nn.softplus(penalty_param) penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim)) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # # Network for safety value function # with tf.variable_Scope("vc",reuse=False): # self.cost_value = MLPValue(self.sess, self.observation_spacem, self.n_envs, 1, None) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret)) vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret)) # advantage * pnew / pold ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) # Surrogate for cost function surrcost = tf.reduce_mean(ratio * catarg) optimgain = surrgain + entbonus # Include surr_cost in pi_objective optimgain -= penalty * surrcost optimgain /= (1 + penalty) # # Loss function for pi is negative of pi_objective # optimgain = -optimgain # Should we?? losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost] self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) start += var_size gvp = tf.add_n([tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 # Fisher vector products fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('penalty_loss', penalty_loss) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('constraint_cost_function_loss', surrcost) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg], fvp) # Why need all inputs? Might for implementation easiness # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret], # tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? Doesn't seem to be used # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret], # tf_util.flatgrad(vcerr, vcf_var_list)) self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret], [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)]) self.compute_lagrangiangrad = tf_util.function([cur_cost_ph], tf_util.flatgrad(penalty_loss, [penalty_param])) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() # optimizer for constraint costs value function self.vcadam = MpiAdam(vcf_var_list, sess=self.sess) self.vcadam.sync() # optimizer for lagragian value of safe RL self.penaltyadam = MpiAdam([penalty_param], sess=self.sess) self.penaltyadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('discounted_costs', tf.reduce_mean(cret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('discounted_rewards', cret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('penalty_learning_rate', self.penalty_lr) tf.summary.histogram('advantage', atarg) tf.summary.histogram('cost_advantage', catarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi") if self.using_gail: self.params.extend(self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier from stable_baselines.mdal.adversary import TabularAdversaryTF, NeuralAdversaryTRPO with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the MDPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) # self._setup_learn(self.seed) self._setup_learn() if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) elif self.using_mdal: if self.neural: self.reward_giver = NeuralAdversaryTRPO(self.sess, self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) else: self.reward_giver = TabularAdversaryTF(self.sess, self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff, expert_features=self.expert_dataset.successor_features, exploration_bonus=self.exploration_bonus, bonus_coef=self.bonus_coef, t_c=self.t_c, is_action_features=self.is_action_features) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): self.old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for fitting closed form with tf.variable_scope("closedpi", reuse=False): self.closed_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) self.vtarg = tf.placeholder(dtype=tf.float32, shape=[None]) self.ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="learning_rate_ph") self.outer_learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name="outer_learning_rate_ph") self.old_vpred_ph = tf.placeholder(dtype=tf.float32, shape=[None], name="old_vpred_ph") self.clip_range_vf_ph = tf.placeholder(dtype=tf.float32, shape=[], name="clip_range_ph") observation = self.policy_pi.obs_ph self.action = self.policy_pi.pdtype.sample_placeholder([None]) if self.tsallis_q == 1.0: kloldnew = self.policy_pi.proba_distribution.kl(self.old_policy.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) else: logp_pi = self.policy_pi.proba_distribution.logp(self.action) logp_pi_old = self.old_policy.proba_distribution.logp(self.action) ent = self.policy_pi.proba_distribution.entropy() #kloldnew = self.policy_pi.proba_distribution.kl_tsallis(self.old_policy.proba_distribution, self.tsallis_q) tsallis_q = 2.0 - self.tsallis_q meankl = tf.reduce_mean(tf_log_q(tf.exp(logp_pi), tsallis_q) - tf_log_q(tf.exp(logp_pi_old), tsallis_q)) #tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent if self.cliprange_vf is None: vpred_clipped = self.policy_pi.value_flat else: vpred_clipped = self.old_vpred_ph + \ tf.clip_by_value(self.policy_pi.value_flat - self.old_vpred_ph, - self.clip_range_vf_ph, self.clip_range_vf_ph) vf_losses1 = tf.square(self.policy_pi.value_flat - self.ret) vf_losses2 = tf.square(vpred_clipped - self.ret) vferr = tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # advantage * pnew / pold ratio = tf.exp(self.policy_pi.proba_distribution.logp(self.action) - self.old_policy.proba_distribution.logp(self.action)) if self.method == "multistep-SGD": surrgain = tf.reduce_mean(ratio * self.atarg) - meankl / self.learning_rate_ph elif self.method == "closedreverse-KL": surrgain = tf.reduce_mean(tf.exp(self.atarg) * self.policy_pi.proba_distribution.logp(self.action)) else: policygain = tf.reduce_mean(tf.exp(self.atarg) * tf.log(self.closed_policy.proba_distribution.mean)) surrgain = tf.reduce_mean(ratio * self.atarg) - tf.reduce_mean(self.learning_rate_ph * ratio * self.policy_pi.proba_distribution.logp(self.action)) optimgain = surrgain #+ entbonus - self.learning_rate_ph * meankl losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name] vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name] print("policy vars", var_list) all_closed_var_list = tf_util.get_trainable_vars("closedpi") closed_var_list = [v for v in all_closed_var_list if "/vf" not in v.name and "/q" not in v.name] self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape)) start += var_size gvp = tf.add_n([tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) # tf.summary.scalar('entropy_loss', meanent) # tf.summary.scalar('policy_gradient_loss', optimgain) # tf.summary.scalar('value_function_loss', surrgain) # tf.summary.scalar('approximate_kullback-leibler', meankl) # tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.learning_rate_ph, self.vtarg], losses) self.compute_fvp = tf_util.function([flat_tangent, observation, self.old_policy.obs_ph, self.action, self.atarg], fvp) self.compute_vflossandgrad = tf_util.function([observation, self.old_policy.obs_ph, self.ret, self.old_vpred_ph, self.clip_range_vf_ph], tf_util.flatgrad(vferr, vf_var_list)) grads = tf.gradients(-optimgain, var_list) grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5) trainer = tf.train.AdamOptimizer(learning_rate=self.outer_learning_rate_ph, epsilon=1e-5) # trainer = tf.train.AdamOptimizer(learning_rate=3e-4, epsilon=1e-5) grads = list(zip(grads, var_list)) self._train = trainer.apply_gradients(grads) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: # print(colorize(msg, color='magenta')) # start_time = time.time() yield # print(colorize("done in {:.3f} seconds".format((time.time() - start_time)), # color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail or self.using_mdal: self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(self.ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(self.atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', self.ret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('advantage', self.atarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi") if self.using_gail: self.params.extend(self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, self.old_policy.obs_ph, self.action, self.atarg, self.ret, self.learning_rate_ph, self.vtarg, self.closed_policy.obs_ph], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def _setup_model(self, rank, memory_size, alpha, obs_space, a_space, noise_target_action, **kwargs): self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) if self.use_prioritiy: from .priority_memory import PrioritizedMemory self.memory = PrioritizedMemory(capacity=memory_size, alpha=alpha) else: from .memory import Memory self.memory = Memory(limit=memory_size, action_shape=a_space.shape, observation_shape=obs_space.shape) # 定义 placeholders self.observe_Input = tf.placeholder(tf.float32, [None] + list(obs_space.shape), name='observe_Input') self.observe_Input_ = tf.placeholder(tf.float32, [None] + list(obs_space.shape), name='observe_Input_') self.R = tf.placeholder(tf.float32, [None, 1], 'r') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights') self.n_step_steps = tf.placeholder(tf.float32, shape=(None, 1), name='n_step_reached') self.q_demo = tf.placeholder(tf.float32, [None, 1], name='Q_of_actions_from_memory') self.come_from_demo = tf.placeholder(tf.float32, [None, 1], name='Demo_index') self.action_memory = tf.placeholder(tf.float32, [None] + list(a_space.shape), name='actions_from_memory') with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=obs_space.shape) with tf.name_scope('obs_preprocess'): self.normalized_observe_Input = tf.clip_by_value( normalize(self.observe_Input, self.obs_rms), -5., 5.) self.normalized_observe_Input_ = tf.clip_by_value( normalize(self.observe_Input_, self.obs_rms), -5., 5.) with tf.variable_scope('Actor'): self.action = self.build_actor(self.normalized_observe_Input, scope='eval', trainable=True, a_space=a_space) self.action_ = self.build_actor(self.normalized_observe_Input_, scope='target', trainable=False, a_space=a_space) # Target policy smoothing, by adding clipped noise to target actions if noise_target_action: epsilon = tf.random_normal(tf.shape(self.action_), stddev=0.007) epsilon = tf.clip_by_value(epsilon, -0.01, 0.01) a2 = self.action_ + epsilon noised_action_ = tf.clip_by_value(a2, -1, 1) else: noised_action_ = self.action_ with tf.variable_scope('Critic'): # Q值都要被clip 防止过估计. self.q_1 = tf.clip_by_value( self.build_critic(self.normalized_observe_Input, self.action, scope='eval_1', trainable=True), self.Q_value_range[0], self.Q_value_range[1]) q_1_ = self.build_critic(self.normalized_observe_Input_, noised_action_, scope='target_1', trainable=False) if self.use_TD3: q_2 = tf.clip_by_value( self.build_critic(self.normalized_observe_Input, self.action, scope='eval_2', trainable=True), self.Q_value_range[0], self.Q_value_range[1]) q_2_ = self.build_critic(self.normalized_observe_Input_, noised_action_, scope='target_2', trainable=False) # Collect networks parameters. It would make it more easily to manage them. self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval') self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target') self.ce1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_1') self.ct1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_1') if self.use_TD3: self.ce2_params = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_2') self.ct2_params = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_2') with tf.variable_scope('Soft_Update'): self.soft_replace_a = [ tf.assign(t, (1 - TAU) * t + TAU * e) for t, e in zip(self.at_params, self.ae_params) ] self.soft_replace_c = [ tf.assign(t, (1 - TAU) * t + TAU * e) for t, e in zip(self.ct1_params, self.ce1_params) ] if self.use_TD3: self.soft_replace_c += [ tf.assign(t, (1 - TAU) * t + TAU * e) for t, e in zip(self.ct2_params, self.ce2_params) ] # critic 的误差 为 (one-step-td 误差 + n-step-td 误差 + critic_online 的L2惩罚) # TD3: critic一共有4个, 算两套 critic的误差, 秀儿. with tf.variable_scope('Critic_Lose'): if self.use_TD3: min_q_ = tf.minimum(q_1_, q_2_) else: min_q_ = q_1_ self.q_target = self.R + (1. - self.terminals1) * GAMMA * min_q_ if self.use_n_step: self.n_step_target_q = self.R + ( 1. - self.terminals1) * tf.pow( GAMMA, self.n_step_steps) * min_q_ cliped_n_step_target_q = tf.clip_by_value( self.n_step_target_q, self.Q_value_range[0], self.Q_value_range[1]) cliped_q_target = tf.clip_by_value(self.q_target, self.Q_value_range[0], self.Q_value_range[1]) self.td_error_1 = tf.abs(cliped_q_target - self.q_1) if self.use_TD3: self.td_error_2 = tf.abs(cliped_q_target - q_2) if self.use_n_step: self.nstep_td_error_1 = tf.abs(cliped_n_step_target_q - self.q_1) if self.use_TD3: self.nstep_td_error_2 = tf.abs(cliped_n_step_target_q - q_2) L2_regular_1 = tf.contrib.layers.apply_regularization( tf.contrib.layers.l2_regularizer(0.001), weights_list=self.ce1_params) if self.use_TD3: L2_regular_2 = tf.contrib.layers.apply_regularization( tf.contrib.layers.l2_regularizer(0.001), weights_list=self.ce2_params) one_step_losse_1 = tf.reduce_mean( tf.multiply(self.ISWeights, tf.square( self.td_error_1))) * self.lambda_1_step if self.use_TD3: one_step_losse_2 = tf.reduce_mean( tf.multiply(self.ISWeights, tf.square( self.td_error_2))) * self.lambda_1_step if self.use_n_step: n_step_td_losses_1 = tf.reduce_mean( tf.multiply( self.ISWeights, tf.square( self.nstep_td_error_1))) * self.lambda_n_step c_loss_1 = one_step_losse_1 + n_step_td_losses_1 + L2_regular_1 if self.use_TD3: n_step_td_losses_2 = tf.reduce_mean( tf.multiply(self.ISWeights, tf.square(self.nstep_td_error_2)) ) * self.lambda_n_step c_loss_2 = one_step_losse_2 + n_step_td_losses_2 + L2_regular_2 else: c_loss_1 = one_step_losse_1 + L2_regular_1 if self.use_TD3: c_loss_2 = one_step_losse_2 + L2_regular_2 # actor 的 loss 为 最大化q(s,a) 最小化行为克隆误差. # (只有demo的transition 且 demo的action 比 actor生成的action q_1(s,a)高的时候 才会有克隆误差) with tf.variable_scope('Actor_lose'): Is_worse_than_demo = self.q_1 < self.q_demo Is_worse_than_demo = tf.cast(Is_worse_than_demo, tf.float32) worse_than_demo = tf.cast(tf.reduce_sum(Is_worse_than_demo), tf.int8) # 算action误差 我用的是平方和, 也有人用均方误差 reduce_mean. 其实都可以. # 我的action本来都是很小的数. action_diffs = Is_worse_than_demo * tf.reduce_sum( self.come_from_demo * tf.square(self.action - self.action_memory), 1, keepdims=True) L_BC = self.LAMBDA_BC * tf.reduce_sum(action_diffs) a_loss = -tf.reduce_mean(self.q_1) + L_BC # Setting optimizer for Actor and Critic with tf.variable_scope('Critic_Optimizer'): if self.use_TD3: self.critic_grads_1 = tf_util.flatgrad( loss=c_loss_1, var_list=self.ce1_params) self.critic_grads_2 = tf_util.flatgrad( loss=c_loss_2, var_list=self.ce2_params) self.critic_optimizer_1 = MpiAdam(var_list=self.ce1_params, beta1=0.9, beta2=0.999, epsilon=1e-08) self.critic_optimizer_2 = MpiAdam(var_list=self.ce2_params, beta1=0.9, beta2=0.999, epsilon=1e-08) else: self.critic_grads = tf_util.flatgrad( loss=c_loss_1, var_list=self.ce1_params) self.critic_optimizer = MpiAdam(var_list=self.ce1_params, beta1=0.9, beta2=0.999, epsilon=1e-08) with tf.variable_scope('Actor_Optimizer'): self.actor_grads = tf_util.flatgrad(a_loss, self.ae_params) self.actor_optimizer = MpiAdam(var_list=self.ae_params, beta1=0.9, beta2=0.999, epsilon=1e-08) with self.sess.as_default(): self._initialize(self.sess) # 保存模型 var_list = tf.global_variables() print( "var_list!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" ) for v in var_list: print(v) self.saver = tf.train.Saver(var_list=var_list, max_to_keep=1) self.writer = tf.summary.FileWriter( "logs/" + self.experiment_name + "/DDPG_" + str(rank), self.graph) # TensorBoard summary self.a_summary = tf.summary.merge([ tf.summary.scalar('a_loss', a_loss, family='actor'), tf.summary.scalar('L_BC', L_BC, family='actor'), tf.summary.scalar('worse_than_demo', worse_than_demo, family='actor') ]) if self.use_TD3: self.c_summary = tf.summary.merge([ tf.summary.scalar('c_loss_1', c_loss_1, family='critic'), tf.summary.scalar('c_loss_2', c_loss_2, family='critic') ]) else: self.c_summary = tf.summary.merge( [tf.summary.scalar('c_loss_1', c_loss_1, family='critic')]) # episode summary self.episode_cumulate_reward = tf.placeholder( tf.float32, name='episode_cumulate_reward') self.episoed_length = tf.placeholder( tf.int16, name='episode_cumulate_reward') self.success_or_not = tf.placeholder( tf.int8, name='episode_cumulate_reward') self.eval_episode_cumulate_reward = tf.placeholder( tf.float32, name='episode_cumulate_reward') self.eval_episoed_length = tf.placeholder( tf.int16, name='episode_cumulate_reward') self.eval_success_or_not = tf.placeholder( tf.int8, name='episode_cumulate_reward') self.episode_summary = tf.summary.merge([ tf.summary.scalar('episode_cumulate_reward', self.episode_cumulate_reward, family='episoed_result'), tf.summary.scalar('episoed_length', self.episoed_length, family='episoed_result'), tf.summary.scalar('success_or_not', self.success_or_not, family='episoed_result'), ]) self.eval_episode_summary = tf.summary.merge([ tf.summary.scalar('eval_episode_cumulate_reward', self.eval_episode_cumulate_reward, family='Eval_episoed_result'), tf.summary.scalar('eval_episoed_length', self.eval_episoed_length, family='Eval_episoed_result'), tf.summary.scalar('eval_success_or_not', self.eval_success_or_not, family='Eval_episoed_result'), ])
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, task_name=None, verbose=False): """ Learn a behavior clone policy, and return the save location :param env: (Gym Environment) the environment :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy :param dataset: (Dset or MujocoDset) the dataset manager :param optim_batch_size: (int) the batch size :param max_iters: (int) the maximum number of iterations :param adam_epsilon: (float) the epsilon value for the adam optimizer :param optim_stepsize: (float) the optimizer stepsize :param ckpt_dir: (str) the save directory, can be None for temporary directory :param task_name: (str) the save name, can be None for saving directly to the directory name :param verbose: (bool) :return: (str) the save location for the TensorFlow model """ val_per_iter = int(max_iters / 10) ob_space = env.observation_space ac_space = env.action_space policy = policy_func("pi", ob_space, ac_space) # Construct network for new policy # placeholder obs_ph = policy.obs_ph action_ph = policy.pdtype.sample_placeholder([None]) stochastic_ph = policy.stochastic_ph loss = tf.reduce_mean(tf.square(action_ph - policy.ac)) var_list = policy.get_trainable_variables() adam = MpiAdam(var_list, epsilon=adam_epsilon) lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph], [loss] + [tf_util.flatgrad(loss, var_list)]) tf_util.initialize() adam.sync() logger.log("Pretraining with Behavior Cloning...") for iter_so_far in tqdm(range(int(max_iters))): ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') train_loss, grad = lossandgrad(ob_expert, ac_expert, True) adam.update(grad, optim_stepsize) if verbose and iter_so_far % val_per_iter == 0: ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') val_loss, _ = lossandgrad(ob_expert, ac_expert, True) logger.log("Training loss: {}, Validation loss: {}".format( train_loss, val_loss)) if ckpt_dir is None: savedir_fname = tempfile.TemporaryDirectory().name else: savedir_fname = os.path.join(ckpt_dir, task_name) tf_util.save_state(savedir_fname, var_list=policy.get_variables()) return savedir_fname
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): self.grad_inverter = grad_inverter( [self.action_space.high, self.action_space.low]) # Target advantage function (if applicable) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ret = tf.placeholder(dtype=tf.float32, shape=[None]) # learning rate multiplier, updated with schedule lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # Annealed cliping parameter epislon clip_param = self.clip_param * lrmult obs_ph = self.policy_pi.obs_ph action_ph = self.policy_pi.pdtype.sample_placeholder( [None]) if debug: action_ph_val = tf.Print( action_ph, [ action_ph, ], '\n\n ====================Unclipped action in: \n', summarize=-1) action_ph_val = tf.Print( action_ph_val, [], '\n ======================================== \n', summarize=-1) kloldnew = old_pi.proba_distribution.kl( self.policy_pi.proba_distribution) # old_logstd = old_pi.proba_distribution.logstd # new_logstd = self.policy_pi.proba_distribution.logstd # old_std = old_pi.proba_distribution.std # new_std = self.policy_pi.proba_distribution.std ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) # meankl = tf.Print(meankl, [meankl,], "kl value: ") # meankl_log = tf.Print(meankl, [old_logstd,], "high kl, old logstd value: ", summarize=-1) # meankl_log = tf.Print(meankl_log, [new_logstd,], "high kl, new logstd value: ", summarize=-1) # meankl_log = tf.Print(meankl_log, [old_std,], "high kl, old std value: ", summarize=-1) # meankl_log = tf.Print(meankl_log, [new_std,], "high kl, new std value: ", summarize=-1) # meanklvalue_ = tf.where( # tf.greater(meankl, tf.constant(1, dtype = tf.float32)), # meankl_log, # meankl) meanent = tf.reduce_mean(ent) pol_entpen = (-self.entcoeff) * meanent # pnew / pold if debug: old_logp = old_pi.proba_distribution.logp( action_ph_val) old_mean = old_pi.proba_distribution.mode() old_std = old_pi.proba_distribution.std old_logp = tf.Print(old_logp, [ old_logp, ], '====== OLD logp: \n', summarize=-1) old_logp = tf.Print(old_logp, [], '\n', summarize=-1) old_logp = tf.Print(old_logp, [ old_mean, ], '====== OLD mean: \n', summarize=-1) old_logp = tf.Print(old_logp, [], '\n', summarize=-1) old_logp = tf.Print(old_logp, [ old_std, ], '====== OLD std: \n', summarize=-1) old_logp = tf.Print(old_logp, [], '\n', summarize=-1) now_logp = self.policy_pi.proba_distribution.logp( action_ph_val) now_mean = self.policy_pi.proba_distribution.mode() now_std = self.policy_pi.proba_distribution.std now_logp = tf.Print(now_logp, [ now_logp, ], '====== NOW logp: \n', summarize=-1) now_logp = tf.Print(now_logp, [], '\n', summarize=-1) now_logp = tf.Print(now_logp, [ now_mean, ], '====== NOW mean: \n', summarize=-1) now_logp = tf.Print(now_logp, [], '\n', summarize=-1) now_logp = tf.Print(now_logp, [ now_std, ], '====== NOW std: \n', summarize=-1) now_logp = tf.Print(now_logp, [], '\n', summarize=-1) else: now_logp = self.policy_pi.proba_distribution.logp( action_ph) old_logp = old_pi.proba_distribution.logp(action_ph) ratio = tf.exp(now_logp - old_logp) if debug: ratio = tf.Print(ratio, [ ratio, ], 'ratio: \n', summarize=-1) ratio = tf.Print(ratio, [], '\n', summarize=-1) # surrogate from conservative policy iteration surr1 = ratio * atarg surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # PPO's pessimistic surrogate (L^CLIP) pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2)) vf_loss = tf.reduce_mean( tf.square(self.policy_pi.value_flat - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] # losses = [pol_surr, pol_entpen, vf_loss, meanklvalue_, meanent] self.loss_names = [ "pol_surr", "pol_entpen", "vf_loss", "kl", "ent" ] tf.summary.scalar('entropy_loss', pol_entpen) tf.summary.scalar('policy_gradient_loss', pol_surr) tf.summary.scalar('value_function_loss', vf_loss) tf.summary.scalar('approximate_kullback-leibler', meankl) tf.summary.scalar('clip_factor', clip_param) tf.summary.scalar('loss', total_loss) self.params = tf_util.get_trainable_vars("model") self.assign_old_eq_new = tf_util.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model")) ]) with tf.variable_scope("Adam_mpi", reuse=False): self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess) with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.optim_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('clip_range', tf.reduce_mean(self.clip_param)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('learning_rate', self.optim_stepsize) tf.summary.histogram('advantage', atarg) tf.summary.histogram('clip_range', self.clip_param) if tf_util.is_image(self.observation_space): tf.summary.image('observation', obs_ph) else: tf.summary.histogram('observation', obs_ph) self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state tf_util.initialize(sess=self.sess) self.summary = tf.summary.merge_all() self.lossandgrad = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses) self.compute_losses = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses)
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) # Construct network for new policy with tf.variable_scope("pi", reuse=False): self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False) # Target advantage function (if applicable) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ret = tf.placeholder(dtype=tf.float32, shape=[None]) # learning rate multiplier, updated with schedule lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # Annealed cliping parameter epislon clip_param = self.clip_param * lrmult obs_ph = self.policy_pi.obs_ph action_ph = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_pi.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-self.entcoeff) * meanent # pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action_ph) - old_pi.proba_distribution.logp(action_ph)) # surrogate from conservative policy iteration surr1 = ratio * atarg surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # PPO's pessimistic surrogate (L^CLIP) pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2)) vf_loss = tf.reduce_mean( tf.square(self.policy_pi.value_fn[:, 0] - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] self.loss_names = [ "pol_surr", "pol_entpen", "vf_loss", "kl", "ent" ] self.params = tf_util.get_trainable_vars("pi") self.lossandgrad = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses + [tf_util.flatgrad(total_loss, self.params)]) self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess) self.assign_old_eq_new = tf_util.function( [], [], updates=[ tf.assign(oldv, newv) for ( oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("pi")) ]) self.compute_losses = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses) self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state tf_util.initialize(sess=self.sess)
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.compat.v1.variable_scope("oldpi", reuse=False): old_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.compat.v1.variable_scope("loss", reuse=False): # Target advantage function (if applicable) atarg = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ret = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None]) # learning rate multiplier, updated with schedule lrmult = tf.compat.v1.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # Annealed cliping parameter epislon clip_param = self.clip_param * lrmult obs_ph = self.policy_pi.obs_ph action_ph = self.policy_pi.pdtype.sample_placeholder( [None]) kloldnew = old_pi.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(input_tensor=kloldnew) meanent = tf.reduce_mean(input_tensor=ent) pol_entpen = (-self.entcoeff) * meanent # pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action_ph) - old_pi.proba_distribution.logp(action_ph)) # surrogate from conservative policy iteration surr1 = ratio * atarg surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # PPO's pessimistic surrogate (L^CLIP) pol_surr = -tf.reduce_mean( input_tensor=tf.minimum(surr1, surr2)) vf_loss = tf.reduce_mean( input_tensor=tf.square(self.policy_pi.value_flat - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] self.loss_names = [ "pol_surr", "pol_entpen", "vf_loss", "kl", "ent" ] tf.compat.v1.summary.scalar('entropy_loss', pol_entpen) tf.compat.v1.summary.scalar('policy_gradient_loss', pol_surr) tf.compat.v1.summary.scalar('value_function_loss', vf_loss) tf.compat.v1.summary.scalar('approximate_kullback-leibler', meankl) tf.compat.v1.summary.scalar('clip_factor', clip_param) tf.compat.v1.summary.scalar('loss', total_loss) self.params = tf_util.get_trainable_vars("model") self.assign_old_eq_new = tf_util.function( [], [], updates=[ tf.compat.v1.assign(oldv, newv) for (oldv, newv) in zipsame( tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model")) ]) with tf.compat.v1.variable_scope("Adam_mpi", reuse=False): self.adam = MpiAdam(self.params, epsilon=self.adam_epsilon, sess=self.sess) with tf.compat.v1.variable_scope("input_info", reuse=False): tf.compat.v1.summary.scalar( 'discounted_rewards', tf.reduce_mean(input_tensor=ret)) tf.compat.v1.summary.scalar( 'learning_rate', tf.reduce_mean(input_tensor=self.optim_stepsize)) tf.compat.v1.summary.scalar( 'advantage', tf.reduce_mean(input_tensor=atarg)) tf.compat.v1.summary.scalar( 'clip_range', tf.reduce_mean(input_tensor=self.clip_param)) if self.full_tensorboard_log: tf.compat.v1.summary.histogram('discounted_rewards', ret) tf.compat.v1.summary.histogram('learning_rate', self.optim_stepsize) tf.compat.v1.summary.histogram('advantage', atarg) tf.compat.v1.summary.histogram('clip_range', self.clip_param) if tf_util.is_image(self.observation_space): tf.compat.v1.summary.image('observation', obs_ph) else: tf.compat.v1.summary.histogram( 'observation', obs_ph) self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state tf_util.initialize(sess=self.sess) self.summary = tf.compat.v1.summary.merge_all() self.lossandgrad = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses) self.compute_losses = tf_util.function( [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses)
def setup_model(self): # prevent import loops from stable_baselines.gail.adversary import TransitionClassifier with SetVerbosity(self.verbose): assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \ "an instance of common.policies.ActorCriticPolicy." self.nworkers = MPI.COMM_WORLD.Get_size() self.rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf_util.single_threaded_session(graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier( self.observation_space, self.action_space, self.hidden_size_adversary, entcoeff=self.adversary_entcoeff) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) # Network for old policy with tf.variable_scope("oldpi", reuse=False): old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, None, reuse=False, **self.policy_kwargs) with tf.variable_scope("loss", reuse=False): atarg = tf.placeholder(dtype=tf.float32, shape=[ None ]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return observation = self.policy_pi.obs_ph action = self.policy_pi.pdtype.sample_placeholder([None]) kloldnew = old_policy.proba_distribution.kl( self.policy_pi.proba_distribution) ent = self.policy_pi.proba_distribution.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = self.entcoeff * meanent vferr = tf.reduce_mean( tf.square(self.policy_pi.value_fn[:, 0] - ret)) # advantage * pnew / pold ratio = tf.exp( self.policy_pi.proba_distribution.logp(action) - old_policy.proba_distribution.logp(action)) surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] self.loss_names = [ "optimgain", "meankl", "entloss", "surrgain", "entropy" ] dist = meankl all_var_list = tf_util.get_trainable_vars("model") var_list = [ v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name ] vf_var_list = [ v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name ] self.get_flat = tf_util.GetFlat(var_list, sess=self.sess) self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: var_size = tf_util.intprod(shape) tangents.append( tf.reshape(flat_tangent[start:start + var_size], shape)) start += var_size gvp = tf.add_n([ tf.reduce_sum(grad * tangent) for (grad, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = tf_util.flatgrad(gvp, var_list) tf.summary.scalar('entropy_loss', meanent) tf.summary.scalar('policy_gradient_loss', optimgain) tf.summary.scalar('value_function_loss', surrgain) tf.summary.scalar('approximate_kullback-leiber', meankl) tf.summary.scalar( 'loss', optimgain + meankl + entbonus + surrgain + meanent) self.assign_old_eq_new = \ tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars("oldpi"), tf_util.get_globals_vars("model"))]) self.compute_losses = tf_util.function( [observation, old_policy.obs_ph, action, atarg], losses) self.compute_fvp = tf_util.function([ flat_tangent, observation, old_policy.obs_ph, action, atarg ], fvp) self.compute_vflossandgrad = tf_util.function( [observation, old_policy.obs_ph, ret], tf_util.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if self.rank == 0 and self.verbose >= 1: print(colorize(msg, color='magenta')) start_time = time.time() yield print( colorize("done in {:.3f} seconds".format( (time.time() - start_time)), color='magenta')) else: yield def allmean(arr): assert isinstance(arr, np.ndarray) out = np.empty_like(arr) MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM) out /= self.nworkers return out tf_util.initialize(sess=self.sess) th_init = self.get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) self.set_from_flat(th_init) with tf.variable_scope("Adam_mpi", reuse=False): self.vfadam = MpiAdam(vf_var_list, sess=self.sess) if self.using_gail: self.d_adam = MpiAdam( self.reward_giver.get_trainable_variables(), sess=self.sess) self.d_adam.sync() self.vfadam.sync() with tf.variable_scope("input_info", reuse=False): tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize)) tf.summary.scalar('advantage', tf.reduce_mean(atarg)) tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl)) if self.full_tensorboard_log: tf.summary.histogram('discounted_rewards', ret) tf.summary.histogram('learning_rate', self.vf_stepsize) tf.summary.histogram('advantage', atarg) tf.summary.histogram('kl_clip_range', self.max_kl) if tf_util.is_image(self.observation_space): tf.summary.image('observation', observation) else: tf.summary.histogram('observation', observation) self.timed = timed self.allmean = allmean self.step = self.policy_pi.step self.proba_step = self.policy_pi.proba_step self.initial_state = self.policy_pi.initial_state self.params = find_trainable_variables("model") if self.using_gail: self.params.extend( self.reward_giver.get_trainable_variables()) self.summary = tf.summary.merge_all() self.compute_lossandgrad = \ tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
def general_actor_critic(input_shape_vec, act_output_shape, comm, learn_rate=[0.001, 0.001], trainable=True, label=""): sess = K.get_session() np.random.seed(0) tf.set_random_seed(0) # network 1 (new policy) with tf.variable_scope(label + "_pi_new", reuse=False): inp = Input(shape=input_shape_vec) # [5,6,3] # rc_lyr = Lambda(lambda x: ned_to_ripCoords_tf(x, 4000))(inp) trunk_x = Reshape([input_shape_vec[0], input_shape_vec[1] * 3])(inp) trunk_x = LSTM(128)(trunk_x) dist, sample_action_op, action_ph, value_output = ppo_continuous( 3, trunk_x) # network 2 (old policy) with tf.variable_scope(label + "_pi_old", reuse=False): inp_old = Input(shape=input_shape_vec) # [5,6,3] # rc_lyr = Lambda(lambda x: ned_to_ripCoords_tf(x, 4000))(inp_old) trunk_x = Reshape([input_shape_vec[0], input_shape_vec[1] * 3])(inp_old) trunk_x = LSTM(128)(trunk_x) dist_old, sample_action_op_old, action_ph_old, value_output_old = ppo_continuous( 3, trunk_x) # additional placeholders adv_ph = tf.placeholder(tf.float32, [None], name="advantages_ph") alpha_ph = tf.placeholder(tf.float32, shape=(), name="alpha_ph") vtarg = tf.placeholder(tf.float32, [None]) # target value placeholder # loss loss = ppo_continuous_loss(dist, dist_old, value_output, action_ph, alpha_ph, adv_ph, vtarg) # gradient with tf.variable_scope("grad", reuse=False): gradient = tf_util.flatgrad( loss, tf_util.get_trainable_vars(label + "_pi_new")) adam = MpiAdam(tf_util.get_trainable_vars(label + "_pi_new"), epsilon=0.00001, sess=sess, comm=comm) # method for sync'ing the two policies assign_old_eq_new = tf_util.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(tf_util.get_globals_vars(label + "_pi_old"), tf_util.get_globals_vars(label + "_pi_new")) ]) # initialize all the things init_op = tf.global_variables_initializer() sess.run(init_op) # methods for interacting with this model def sync_weights(): assign_old_eq_new(sess=sess) def sample_action(states, logstd_override=None): a = sess.run(sample_action_op, feed_dict={inp: states}) return a def sample_value(states): v = sess.run(value_output, feed_dict={inp: states}) return v def train(states, actions, vtarget, advs, alpha): alpha = max(alpha, 0.0) adam_lr = learn_rate[0] g = sess.run( [gradient], feed_dict={ inp: states, inp_old: states, action_ph: actions, adv_ph: advs, alpha_ph: alpha, vtarg: vtarget }) adam.update(g[0], adam_lr * alpha) # initial sync adam.sync() sync_weights() return sync_weights, sample_action, sample_value, train