def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the observation filter, the value network and the policy network.

    The value network lives in variable scope ``vf`` and the policy network in
    variable scope ``pol``; both consume the same normalised, clipped
    observation and each has ``num_hid_layers`` tanh layers of width
    ``hid_size``.

    Args:
        ob_space: Observation space; must be a ``gym.spaces.Box``.
        ac_space: Action space, passed to ``make_pdtype``.
        hid_size: Width of every hidden layer.
        num_hid_layers: Number of hidden layers in each of the two networks.
        gaussian_fixed_var: When true and ``ac_space`` is a ``Box``, the second
            half of the distribution parameters comes from a standalone
            ``logstd`` variable instead of the network output.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype

    # The leading dimension of the observation placeholder is left unspecified.
    ob_shape = [None] + list(ob_space.shape)
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    def tanh_stack(features):
        # Dense+tanh layers named fc1..fcN, created in the enclosing scope.
        for layer_no in range(1, num_hid_layers + 1):
            features = tf.nn.tanh(
                tf.layers.dense(features,
                                hid_size,
                                name="fc%i" % layer_no,
                                kernel_initializer=U.normc_initializer(1.0)))
        return features

    with tf.variable_scope('vf'):
        # Normalise with the running statistics, then clip to [-5, 5].
        obz = tf.clip_by_value(
            (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        vf_hidden = tanh_stack(obz)
        vf_out = tf.layers.dense(vf_hidden,
                                 1,
                                 name='final',
                                 kernel_initializer=U.normc_initializer(1.0))
        self.vpred = vf_out[:, 0]

    with tf.variable_scope('pol'):
        pol_hidden = tanh_stack(obz)
        use_free_logstd = gaussian_fixed_var and isinstance(
            ac_space, gym.spaces.Box)
        if not use_free_logstd:
            pdparam = tf.layers.dense(
                pol_hidden,
                pdtype.param_shape()[0],
                name='final',
                kernel_initializer=U.normc_initializer(0.01))
        else:
            mean = tf.layers.dense(
                pol_hidden,
                pdtype.param_shape()[0] // 2,
                name='final',
                kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of `mean`.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # Boolean switch between sampling from the distribution and taking its mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [action, self.vpred])
def __init__(self, env, hidden_size, expert_dataset):
    """Build the guidance model graph inside variable scope ``guidance``.

    The same observation placeholder is scored twice through ``build_graph``:
    once paired with the generator (agent) actions and once with the expert
    actions. The two scores are compared with ``RankLoss``.

    Args:
        env: Environment; only its observation and action space shapes are read.
        hidden_size: Stored on ``self.hidden_size``.
        expert_dataset: Not referenced in this constructor.
    """
    with tf.variable_scope('guidance'):
        self.hidden_size = hidden_size
        self.scope = tf.get_variable_scope().name
        self.observation_shape = env.observation_space.shape
        self.actions_shape = env.action_space.shape
        self.build_ph()

        # Build the graph: one scoring pass for the generator actions and one
        # (reusing the same variables) for the expert actions; the output is
        # 1-dimensional.
        agent_logits = self.build_graph(self.generator_obs_ph,
                                        self.generator_acs_ph,
                                        reuse=False)
        demo_logits = self.build_graph(self.generator_obs_ph,
                                       self.expert_acs_ph,
                                       reuse=True)

        self.expert_label = tf.ones_like(demo_logits)
        self.generator_label = tf.zeros_like(agent_logits)

        # Ranking label: maps the boolean comparison to -1.0 / +1.0.
        is_ge = self.generator_label >= self.expert_label
        rank_label = (tf.cast(is_ge, tf.float32) - 0.5) * 2

        pairwise_loss = RankLoss(predict_score1=agent_logits,
                                 predict_score2=demo_logits,
                                 label=rank_label)
        self.loss = tf.reduce_mean(pairwise_loss)

        adam = tf.train.AdamOptimizer()
        self.train_op = adam.minimize(self.loss)

        # Build the reward for the policy. (The original author left an open
        # question here: why is the generator score used for reward_op?)
        agent_prob = tf.nn.sigmoid(agent_logits)
        self.reward_op = -tf.log(1 - agent_prob + 1e-8)

        self.loss_name = ["guidance__loss"]

        trainable = self.get_trainable_variables()
        self.lossandgrad = U.function(
            [self.generator_obs_ph, self.generator_acs_ph, self.expert_acs_ph],
            [self.loss] + [U.flatgrad(self.loss, trainable)])
def __init__(self, epsilon=1e-2, shape=()):
    """Create running-sum variables and the ops that derive mean/std from them.

    Args:
        epsilon: Initial value of the sum-of-squares and count accumulators.
        shape: Shape of the sum and sum-of-squares accumulators.
    """

    def accumulator(var_name, var_shape, initial_value):
        # Non-trainable float64 variable filled with a constant.
        return tf.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.constant_initializer(initial_value),
            name=var_name,
            trainable=False)

    self._sum = accumulator("runningsum", shape, 0.0)
    self._sumsq = accumulator("runningsumsq", shape, epsilon)
    self._count = accumulator("count", (), epsilon)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    # The variance estimate is floored at 1e-2 before taking the square root.
    self.std = tf.sqrt(
        tf.maximum(mean_of_squares - tf.square(self.mean), 1e-2))

    # Placeholders carrying the increments that are added to the accumulators.
    sum_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_ph = tf.placeholder(shape=[], dtype=tf.float64, name='count')

    increments = [
        tf.assign_add(target, delta)
        for target, delta in ((self._sum, sum_ph),
                              (self._sumsq, sumsq_ph),
                              (self._count, count_ph))
    ]
    self.incfiltparams = U.function([sum_ph, sumsq_ph, count_ph], [],
                                    updates=increments)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build a value network whose last hidden layer also feeds the policy head.

    Unlike a two-tower layout, the action distribution is produced by
    ``pdtype.pdfromlatent`` directly from the value network's final hidden
    activations (the original code marks this as a change made for BC; the
    separate policy tower was left commented out).

    Args:
        ob_space: Observation space; must be a ``gym.spaces.Box``.
        ac_space: Action space, passed to ``make_pdtype``.
        hid_size: Width of every hidden layer.
        num_hid_layers: Number of hidden tanh layers.
        gaussian_fixed_var: Not referenced by the active code in this method.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype

    # The leading dimension of the observation placeholder is left unspecified.
    ob = U.get_placeholder(name="ob",
                           dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))
    self.obs = ob

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Normalise with the running statistics, then clip to [-5, 5].
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                           5.0)

    hidden = obz
    for layer_no in range(1, num_hid_layers + 1):
        hidden = tf.nn.tanh(
            dense(hidden,
                  hid_size,
                  "vffc%i" % layer_no,
                  weight_init=U.normc_initializer(1.0)))

    value_out = dense(hidden, 1, "vffinal",
                      weight_init=U.normc_initializer(1.0))
    self.v_preds = value_out[:, 0]

    # Policy head shares the value network's hidden features.
    self.pd, self.pi = pdtype.pdfromlatent(hidden)

    # Boolean switch between sampling from the distribution and taking its mode.
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = action
    self._act = U.function([stochastic, ob], [action, self.v_preds])
def learn(env, policy_func, reward_giver, reward_guidance, expert_dataset, rank,
          pretrained, pretrained_weight, *, g_step, d_step, entcoeff,
          save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, task_name,
          gamma, lam, algo, max_kl, cg_iters, cg_damping=1e-2,
          vf_stepsize=3e-4, d_stepsize=1e-4, vf_iters=3, max_timesteps=0,
          max_episodes=0, max_iters=0, loss_percent=0.0, callback=None):
    """Train a policy with TRPO alongside a discriminator and a guidance model.

    Every outer iteration performs, in order:

    1. ``g_step`` policy updates: sample a segment, compute advantages, take a
       conjugate-gradient / line-search step on the policy and run ``vf_iters``
       epochs of Adam on the value function.
    2. Minibatch updates of ``reward_giver`` (the discriminator).
    3. Minibatch updates of ``reward_guidance`` on the expert samples selected
       by ``process_expert(...) >= loss_percent``.
    4. Episode statistics gathering and tabular logging.

    Args:
        env: Environment; passed to ``build_policy`` and the segment generator.
        policy_func: Not referenced in this function.
        reward_giver: Discriminator exposing ``get_trainable_variables``,
            ``loss_name`` and ``lossandgrad``.
        reward_guidance: Guidance model exposing the same three members.
        expert_dataset: Source of expert batches via ``get_next_batch``.
        rank: Overwritten with ``MPI.COMM_WORLD.Get_rank()``.
        pretrained: Not referenced in this function.
        pretrained_weight: If not ``None``, loaded into the policy variables
            with ``U.load_state`` before training starts.
        g_step: Number of policy updates per outer iteration.
        d_step: Not referenced in this function.
        entcoeff: Coefficient of the entropy bonus.
        save_per_iter: Checkpoint period, in outer iterations (rank 0 only).
        ckpt_dir: Checkpoint directory; ``None`` disables saving.
        log_dir: Not referenced in this function.
        timesteps_per_batch: Passed to ``traj_segment_generator``.
        task_name: File name of the checkpoint inside ``ckpt_dir``.
        gamma, lam: Passed to ``add_vtarg_and_adv``.
        algo: Passed to ``traj_segment_generator``.
        max_kl: KL bound used to scale and to accept the policy step.
        cg_iters: Number of conjugate-gradient iterations.
        cg_damping: Damping added to the Fisher-vector product.
        vf_stepsize: Adam step size for the value function.
        d_stepsize: Adam step size for discriminator and guidance updates.
        vf_iters: Value-function epochs per policy update.
        max_timesteps, max_episodes, max_iters: Stopping criteria; exactly one
            must be positive.
        loss_percent: Threshold applied to ``process_expert`` output and passed
            to ``traj_segment_generator``.
        callback: If set, called as ``callback(locals(), globals())`` at the
            top of every outer iteration.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    # NOTE: local names are kept unchanged on purpose -- `callback` receives
    # `locals()` and may look any of them up by name.
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = build_policy(env, 'mlp', value_network='copy')
    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope('oldpi'):
        oldpi = policy(observ_placeholder=ob)

    # Target advantage function (if applicable)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    # Empirical return
    ret = tf.placeholder(dtype=tf.float32, shape=[None])
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    # advantage * pnew / pold
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables('pi')
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    guidance_adam = MpiAdam(reward_guidance.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    # Slice the flat tangent vector into pieces shaped like each variable.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                get_variables('oldpi'), get_variables('pi'))
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function(
        [ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints the label and the elapsed time.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Element-wise mean of `x` across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    guidance_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     reward_guidance,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     algo=algo,
                                     loss_percent=loss_percent)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])

    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            # `fvpargs` is bound below, before the first call.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            print('rewards', seg['rew'])
            add_vtarg_and_adv(seg, gamma, lam)
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            # Standardized advantage estimate. The epsilon keeps a batch with
            # constant advantages from producing NaNs (0 / 0).
            atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8)

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # Every fifth sample is used for the Fisher-vector product.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
                # No step is taken, so the losses reported for this update are
                # the ones measured before it. Without this, `meanlosses` is
                # unbound (first update) or stale (later updates).
                meanlosses = lossbefore
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            # update running mean/std for policy
                            pi.ob_rms.update(mbob)
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, reward_giver.loss_name))
        # NOTE(review): this batch is overwritten inside the loop below before
        # being used; the call is kept because it presumably advances the
        # dataset's position -- confirm before removing.
        ob_expert, ac_expert = expert_dataset.get_next_batch(
            batch_size=len(ob))
        batch_size = 128
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        with timed("Discriminator"):
            for (ob_batch, ac_batch) in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)
                # update running mean/std for reward_giver
                if hasattr(reward_giver, "obs_rms"):
                    reward_giver.obs_rms.update(
                        np.concatenate((ob_batch, ob_expert), 0))
                *newlosses, g = reward_giver.lossandgrad(ob_batch, ob_expert)
                d_adam.update(allmean(g), d_stepsize)
                d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ------------------ Update Guidance ------------
        logger.log("Optimizing Guidance...")
        logger.log(fmt_row(13, reward_guidance.loss_name))
        batch_size = 128
        guidance_losses = []  # list of tuples, each of which gives the loss for a minibatch
        with timed("Guidance"):
            for ob_batch, ac_batch in dataset.iterbatches(
                (ob, ac),
                    include_final_partial_batch=False,
                    batch_size=batch_size):
                ob_expert, ac_expert = expert_dataset.get_next_batch(
                    batch_size=batch_size)
                # Keep only the expert samples that pass the threshold.
                idx_condition = process_expert(ob_expert, ac_expert)
                pick_idx = (idx_condition >= loss_percent)
                ob_expert_p = ob_expert[pick_idx]
                ac_expert_p = ac_expert[pick_idx]
                # Query the current policy on each selected expert observation.
                ac_batch_p = []
                for each_ob in ob_expert_p:
                    tmp_ac, _, _, _ = pi.step(each_ob, stochastic=True)
                    ac_batch_p.append(tmp_ac)
                # update running mean/std for reward_guidance
                if hasattr(reward_guidance, "obs_rms"):
                    reward_guidance.obs_rms.update(ob_expert_p)
                # NOTE(review): if `pick_idx` selects nothing this runs on an
                # empty batch; the minibatch cannot simply be skipped because
                # `allmean` is an MPI collective that every rank must enter.
                *newlosses, g = reward_guidance.lossandgrad(
                    ob_expert_p, ac_batch_p, ac_expert_p)
                guidance_adam.update(allmean(g), d_stepsize)
                guidance_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(guidance_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens) * g_step
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
def __init__(self, observations, action_space, latent, optimizer=None, sess=None, train=True, beta=1.0, l2=0., lr=0.001, init_scale=0.01, init_bias=0.0, trainable_variance=True, trainable_bias=True, init_logstd=0., scope_name="pi", clip=None, state_dependent_variance=True, **tensors):
    """Build the action distribution and the log-likelihood gradient helpers.

    Parameters:
    ----------
    observations    tensorflow placeholder in which the observations will be fed

    action_space    action space; passed to ``make_pdtype``

    latent          latent state from which policy distribution parameters should be inferred

    optimizer       optimizer to store on ``self.optimizer``; when None an Adam
                    optimizer with learning rate ``lr`` is created

    sess            tensorflow session to run calculations in (if None, default session is used)

    train           when true, ``l2`` is stored on ``self.gamma`` and
                    ``self._build_train`` is called

    beta            forwarded to ``pdfromlatent``; when different from 1.0 the
                    probabilities come from ``boltzmann`` instead of a softmax

    l2              stored on ``self.gamma`` when ``train`` is true

    lr              learning rate of the default Adam optimizer

    init_scale, init_bias, trainable_variance, trainable_bias, init_logstd, clip
                    forwarded unchanged to ``pdfromlatent``

    scope_name      scope whose trainable variables are collected in ``self.vars``

    state_dependent_variance
                    forwarded to ``self._build_train``

    **tensors       tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    latent = tf.layers.flatten(latent)

    self.action_space = action_space
    self.pdtype = make_pdtype(action_space)

    self.pd, self.pi = self.pdtype.pdfromlatent(
        latent,
        init_scale=init_scale,
        init_bias=init_bias,
        trainable_variance=trainable_variance,
        trainable_bias=trainable_bias,
        init_logstd=init_logstd,
        clip=clip,
        beta=beta)

    # Boolean switch between sampling from the distribution and taking its mode.
    self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
    self.action = tf_util.switch(self.stochastic, self.pd.sample(),
                                 self.pd.mode())
    self.neglogp = self.pd.neglogp(self.action)

    if beta == 1.0:
        self.prob = tf.nn.softmax(self.pd.flatparam())
    else:
        self.prob = boltzmann(self.pd.flatparam(), beta=beta)

    if optimizer is None:
        self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    else:
        self.optimizer = optimizer

    self.sess = sess or tf.get_default_session()
    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=scope_name)

    # First try the one-hot formulation, which needs `action_space.n`; if
    # building it fails, fall back to the neglogp-based formulation.
    # `Exception` (rather than a bare `except`) keeps KeyboardInterrupt and
    # SystemExit from being swallowed by this fallback.
    try:
        self.action_ph = tf.placeholder(tf.int64, [None],
                                        name='targets_placeholder')
        self.action_selected = action_selected = tf.one_hot(
            self.action_ph, self.action_space.n)
        # Mean log-probability of the selected actions.
        out = tf.reduce_mean(
            tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
        gradients = tf.gradients(out, self.vars)
    except Exception:
        self.action_ph = tf.placeholder(dtype=tf.float32,
                                        shape=(None, ) + action_space.shape,
                                        name='targets_placeholder')
        gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)

    self.cont = cont = not isinstance(self.action_space, Discrete)

    flat_grad = tf_util.GetFlat(gradients).op
    self.compute_gradients = tf_util.function(
        inputs=[self.X, self.action_ph], outputs=[flat_grad])
    self.debug = tf_util.function(
        inputs=[self.X, self.action_ph],
        outputs=[gradients, self.prob, self.action_ph])
    self.set_from_flat = tf_util.SetFromFlat(self.vars)

    if self.cont:
        # Per-dimension R^2 of the distribution mean against the fed actions.
        total_error = tf.reduce_sum(
            tf.square(self.action_ph - tf.reduce_mean(self.action_ph, axis=0)),
            axis=0)
        unexplained_error = tf.reduce_sum(
            tf.square(self.action_ph - self.pd.mean), axis=0)
        R_squared = 1 - (unexplained_error / total_error)
        self.accuracy = accuracy = R_squared
    else:
        # Fraction of fed actions that equal the distribution mode.
        self.accuracy = accuracy = tf.reduce_mean(
            tf.cast(tf.math.equal(self.pd.mode(), self.action_ph), tf.float32))

    self.entropy = entropy = tf.reduce_mean(self.pd.entropy())

    if train:
        self.gamma = l2
        self._build_train(
            cont=cont, state_dependent_variance=state_dependent_variance)

    self.pdf = tf.exp(self.pd.logp(self.action_ph))
def learn(comm, env, bc_agent_wrapper, experiment_name, ckpt_dir, summary_dir, expert_dataset, lr, batch_size, max_iters):
    """Run behavioural cloning for ``max_iters`` iterations.

    Each iteration takes one gradient step on a training batch from
    ``expert_dataset``, then evaluates the policy loss on that batch and on a
    validation batch requested with size ``-1``. Rank 0 additionally saves the
    model periodically and writes the rolling mean losses to a summary writer.

    Args:
        comm: MPI communicator (rank query, barrier, optimizer synchronisation).
        env: Not referenced in this function.
        bc_agent_wrapper: Callable returning the BC agent when given ``'pol'``.
        experiment_name: File name of the saved model inside ``ckpt_dir``.
        ckpt_dir: Checkpoint directory; ``None`` disables saving.
        summary_dir: Directory handed to ``U.file_writer`` on rank 0.
        expert_dataset: Source of batches via ``get_next_pair_batch``.
        lr: Learning rate of the MPI Adam optimizer.
        batch_size: Training batch size.
        max_iters: Number of training iterations.
    """
    rank = comm.Get_rank()
    is_root = rank == 0

    # Create the BC agent and the MPI Adam optimizer for its policy
    pol = bc_agent_wrapper('pol')
    pol_optimizer = MpiAdamOptimizer(comm,
                                     clip_norm=pol.hps.clip_norm,
                                     learning_rate=lr,
                                     name='pol_adam')
    minimize_op = pol_optimizer.minimize(pol.loss,
                                         var_list=pol.trainable_vars)

    # Retrieve the already-existing placeholders and wrap the update op
    e_obs = U.get_placeholder_cached(name='e_obs')
    e_acs = U.get_placeholder_cached(name='e_acs')
    optimize_pol = U.function([e_obs, e_acs], minimize_op)

    U.initialize()
    # Sync params of all processes with the params of the root process
    pol_optimizer.sync_from_root(pol.trainable_vars)

    if is_root:
        # Summary machinery only exists on the root process
        writer = U.file_writer(summary_dir)
        summary_keys = ['train_loss', 'val_loss']
        summary = CustomSummary(scalar_keys=summary_keys, family="bc")

    tstart = time.time()

    # Rolling buffers for loss collection
    window = 100
    train_losses = deque(maxlen=window)
    val_losses = deque(maxlen=window)

    save_period = int(1e4)
    log_period = 100

    for it in range(max_iters):
        # Verify that the processes are still in sync
        if it > 0 and it % 10 == 0:
            pol_optimizer.check_synced(pol.trainable_vars)

        # Save the model
        if is_root and it % save_period == 0 and ckpt_dir is not None:
            model_path = osp.join(ckpt_dir, experiment_name)
            U.save_state(model_path, iters_so_far=it)
            logger.info("saving model")
            logger.info(" @: {}".format(model_path))

        # Make non-zero-rank workers wait for rank zero
        comm.Barrier()

        # Training fraction of the demonstration dataset
        obs, acs = expert_dataset.get_next_pair_batch(batch_size, 'train')
        if hasattr(pol, "obs_rms"):
            # Update running mean and std on states
            pol.obs_rms.update(obs, comm)
        # Gradient step, then training loss on the same batch
        optimize_pol(obs, acs)
        pol_train_loss = pol.compute_pol_loss(obs, acs)
        train_losses.append(pol_train_loss)

        # Validation fraction of the demonstration dataset
        obs, acs = expert_dataset.get_next_pair_batch(-1, 'val')
        pol_val_loss = pol.compute_pol_loss(obs, acs)
        val_losses.append(pol_val_loss)

        if it % log_period == 0:
            # Log training and validation losses
            message = ('iter #{} '
                       '| train loss: {} '
                       '| val loss: {} '
                       '| elapsed: {}')
            logger.info(
                message.format(it, pol_train_loss, pol_val_loss,
                               prettify_time(time.time() - tstart)))

        # Rolling means to be dumped in summaries (computed on every worker)
        all_summaries = [np.mean(train_losses), np.mean(val_losses)]
        if is_root:
            assert len(summary_keys) == len(
                all_summaries), "mismatch in list lengths"
            summary.add_all_summaries(writer, all_summaries, it)
def learn(
        *,
        network,
        env,
        eval_policy,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        checkpoint_path_in=None,
        checkpoint_dir_out=None,
        checkpoint_freq=100,  # In iterations!!
        from_iter=0,
        eval_episodes=20,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class

    eval_policy             callable invoked at every checkpoint iteration as
                            eval_policy(pi=..., n_episodes=eval_episodes, verbose=True); must return a 6-tuple

    total_timesteps         max number of timesteps

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    cg_iters                number of iterations of conjugate gradient algorithm

    gamma, lam              passed to add_vtarg_and_adv (gamma is also passed to the segment generator)

    seed                    passed to set_global_seeds

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of iterations of value function optimization iterations per each policy optimization step

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    checkpoint_path_in      if it names an existing file, policy weights are loaded from it before training

    checkpoint_dir_out      directory for checkpoints and score files; None disables all of them

    checkpoint_freq         checkpoint / evaluation period, in iterations

    from_iter               not referenced in this function

    eval_episodes           number of episodes requested from eval_policy

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    # NOTE: local names are kept unchanged on purpose -- `callback` receives
    # `locals()` and may look any of them up by name.
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    # Target advantage function (if applicable)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    # Empirical return
    ret = tf.placeholder(dtype=tf.float32, shape=[None])
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    # advantage * pnew / pold
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    # Slice the flat tangent vector into pieces shaped like each variable.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function(
        [ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints the label and the elapsed time.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Element-wise mean of `x` across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()

    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf

    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    online_scores = []
    offline_scores = []
    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback:
            callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        # Periodic checkpoint followed by an evaluation of the current policy.
        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            # exist_ok avoids the check-then-create race between MPI workers.
            os.makedirs(checkpoint_dir_out, exist_ok=True)
            pi.save(
                os.path.join(checkpoint_dir_out,
                             'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(
                checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append(
                [np.mean(disc_rets),
                 np.mean(num_stops),
                 np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'),
                    offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs

            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        # Standardized advantage estimate. The epsilon keeps a batch with
        # constant advantages from producing NaNs (0 / 0).
        atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8)

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        # Every fifth sample is used for the Fisher-vector product.
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
            # No step is taken, so the losses reported for this iteration are
            # the ones measured before it. Without this, `meanlosses` is
            # unbound (first iteration) or stale (later iterations).
            meanlosses = lossbefore
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            # Backtracking line search: halve the step up to 10 times.
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        # Only write the score file when an output directory was given; with
        # the default checkpoint_dir_out=None, os.path.join would raise.
        if checkpoint_dir_out is not None:
            np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'),
                    online_scores)

        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' %
                       os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    """Train a policy with a KL-penalised, KL-masked surrogate objective.

    Each iteration collects one trajectory segment, standardises the
    advantages, copies the current policy into ``oldpi`` and then runs Adam
    epochs over the segment until the mean KL to the old policy exceeds
    ``args.agg_kl_threshold`` or the epoch limit is hit.

    Parameters
    ----------
    env
        Environment; only ``observation_space`` and ``action_space`` are read
        here, the environment itself is handed to ``traj_segment_generator``.
    policy_fn
        Callable ``(name, ob_space, ac_space) -> policy``. It is called twice,
        with the names ``"pi"`` and ``"oldpi"``.
    timesteps_per_actorbatch
        Forwarded to ``traj_segment_generator``.
    optim_stepsize
        Base Adam step size; multiplied by the schedule multiplier.
    optim_batchsize
        Minibatch size. A falsy value means "use the whole segment".
    gamma, lam
        Forwarded to ``add_vtarg_and_adv``.
    entcoeff
        Entropy coefficient; the penalty term is ``-entcoeff * mean entropy``.
    max_episodes, max_iters, max_seconds
        Stopping criteria. Exactly one of these or ``args.num_timesteps`` must
        be positive.
    callback
        Optional callable invoked as ``callback(locals(), globals())`` at the
        start of every iteration.
    adam_epsilon
        Epsilon passed to ``MpiAdam``.
    schedule
        ``'constant'`` or ``'linear'`` step-size annealing.
    args
        Object providing ``kl_threshold``, ``sepg_lam``, ``agg_kl_threshold``,
        ``optim_epochs`` and ``num_timesteps``.

    Returns
    -------
    list
        ``(timesteps_so_far, mean_score)`` tuples, one per iteration in which
        the reward buffer was non-empty. The timestep count is the value
        *before* that iteration's episodes were added.

    Raises
    ------
    AssertionError
        If the number of active time constraints is not exactly one.
    NotImplementedError
        If ``schedule`` is neither ``'constant'`` nor ``'linear'``.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    logp_new = pi.pd.logp(ac)
    logp_old = oldpi.pd.logp(ac)
    # newprob / oldprob are no longer used for the ratio; they are kept only
    # because callbacks receive locals() and may look them up by name.
    newprob = tf.exp(logp_new)
    oldprob = tf.exp(logp_old)
    # Form the importance ratio in log space. exp(a) / exp(b) turns into
    # 0/0 or inf/inf (NaN) as soon as both log-probabilities under- or
    # overflow, whereas exp(a - b) stays finite in those cases.
    ratio = tf.exp(logp_new - logp_old)

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    # Per-sample 0/1 mask: samples whose KL already reached the threshold
    # contribute nothing to the surrogate.
    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)

    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold
    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    running_scores = []

    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # NOTE(review): divides by args.num_timesteps, so 'linear' only
            # works when the timestep budget is the active time constraint.
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = next(seg_gen)
        add_vtarg_and_adv(seg, gamma, lam)

        # From here on ob / ac / atarg name the sampled arrays rather than the
        # placeholders; every U.function above has already captured the
        # placeholders, so the rebinding is harmless.
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data.
        # NOTE(review): the exit test runs after the epoch with index
        # args.optim_epochs, so up to optim_epochs + 1 passes are made --
        # confirm that this is intended.
        for num_epoch in count():
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            # Early stopping on the KL measured over the whole segment.
            agg_mean_kl = get_mean_kl(ob, ac)
            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        rewbuffer.extend(rews)

        mean_score = None
        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)
            logger.dump_tabular()

    return running_scores
def __init__(self, env, hidden_size, expert_dataset):
    """Build the guidance network for vector-valued actions.

    The graph embeds the (normalised) observation and the agent action
    separately, concatenates the two embeddings, and maps them to an output
    with ``env.action_space.shape[0]`` units. That output is trained with a
    softmax cross-entropy against the softmax of the expert action.

    Parameters
    ----------
    env
        Environment; only the shapes of its observation and action spaces
        are read.
    hidden_size
        Width of every hidden dense layer.
    expert_dataset
        Stored on the instance as ``self.expert_dataset``; not otherwise
        used while building the graph.
    """
    self.hidden_size = hidden_size
    self.expert_dataset = expert_dataset

    def leaky_dense(tensor, layer_name):
        # All hidden layers share width and activation; only the input and
        # the layer name vary.
        return tf.layers.dense(inputs=tensor,
                               units=self.hidden_size,
                               activation=tf.nn.leaky_relu,
                               name=layer_name)

    with tf.variable_scope('guidance'):
        self.scope = tf.get_variable_scope().name

        # Placeholders: agent state, agent action and expert action.
        state_dims = [None] + list(env.observation_space.shape)
        self.agent_s = tf.placeholder(dtype=tf.float32,
                                      shape=state_dims,
                                      name='ph_agent_s')
        action_dims = [None] + list(env.action_space.shape)
        self.agent_a = tf.placeholder(dtype=tf.float32,
                                      shape=action_dims,
                                      name='ph_agent_a')
        self.expert_a = tf.placeholder(dtype=tf.float32,
                                       shape=action_dims,
                                       name='ph_expert_a')

        # Running statistics used to normalise observations.
        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=env.observation_space.shape)
        centred = self.agent_s - self.obs_rms.mean
        obs_ph_rms = centred / self.obs_rms.std

        state_embedding = leaky_dense(obs_ph_rms, 'layer_s')
        action_embedding = leaky_dense(self.agent_a, 'layer_a')
        joint = tf.concat([state_embedding, action_embedding], axis=1)
        hidden = leaky_dense(joint, 'layer1')
        output = tf.layers.dense(inputs=hidden,
                                 units=env.action_space.shape[0],
                                 activation=tf.identity,
                                 name='layer2')

        # NOTE: the original author marked this loss with a "BUG" banner and
        # left a reference to
        # tf.contrib.gan.losses.wargs.mutual_information_penalty as the
        # alternative that was being considered.
        labels = tf.nn.softmax(self.expert_a)
        per_sample_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=labels, logits=output)
        self.loss = tf.reduce_mean(per_sample_loss)

        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.loss)

        self.loss_name = ["guidance_loss"]
        var_list = self.get_trainable_variables()
        feeds = [self.agent_s, self.agent_a, self.expert_a]
        fetches = [self.loss] + [U.flatgrad(self.loss, var_list)]
        self.lossandgrad = U.function(feeds, fetches)
def __init__(self, env, hidden_size, expert_dataset):
    """Build the guidance network for integer-coded (discrete) actions.

    Agent and expert actions are fed as integer ids and one-hot encoded to
    ``env.action_space.n`` classes. The normalised observation and the
    one-hot agent action are embedded separately, concatenated, and mapped to
    a softmax over actions, which is trained with categorical cross-entropy
    against the one-hot expert action.

    Parameters
    ----------
    env
        Environment; ``observation_space.shape`` and ``action_space.n`` are
        read.
    hidden_size
        Width of every hidden dense layer.
    expert_dataset
        Its ``inputs`` and ``labels`` attributes are stored as ``self.obs``
        and ``self.acs``.
    """
    self.obs = expert_dataset.inputs
    self.acs = expert_dataset.labels

    def leaky_dense(tensor, layer_name):
        # Hidden layers differ only in their input and their name.
        return tf.layers.dense(inputs=tensor,
                               units=hidden_size,
                               activation=tf.nn.leaky_relu,
                               name=layer_name)

    with tf.variable_scope('guidance'):
        self.scope = tf.get_variable_scope().name

        state_dims = [None] + list(env.observation_space.shape)
        self.agent_s = tf.placeholder(dtype=tf.float32,
                                      shape=state_dims,
                                      name='ph_agent_s')

        # Integer action ids, expanded to one-hot vectors.
        self.agent_a = tf.placeholder(dtype=tf.int32,
                                      shape=[None],
                                      name='ph_agent_a')
        num_actions = env.action_space.n
        agent_a_one_hot = tf.one_hot(self.agent_a, depth=num_actions)
        self.expert_a = tf.placeholder(dtype=tf.int32,
                                       shape=[None],
                                       name='ph_expert_a')
        expert_a_one_hot = tf.one_hot(self.expert_a, depth=num_actions)

        # Running statistics used to normalise observations.
        with tf.variable_scope("obfilter"):
            self.obs_rms = RunningMeanStd(shape=env.observation_space.shape)
        centred = self.agent_s - self.obs_rms.mean
        obs_ph_rms = centred / self.obs_rms.std

        state_embedding = leaky_dense(obs_ph_rms, 'layer_s')
        action_embedding = leaky_dense(agent_a_one_hot, 'layer_a')
        joint = tf.concat([state_embedding, action_embedding], axis=1)
        hidden = leaky_dense(joint, 'layer1')
        # The output layer already applies softmax, so the Keras loss below
        # is given probabilities rather than logits.
        output = tf.layers.dense(inputs=hidden,
                                 units=num_actions,
                                 activation=tf.nn.softmax,
                                 name='layer2')

        per_sample_loss = tf.keras.losses.categorical_crossentropy(
            y_true=expert_a_one_hot, y_pred=output)
        self.loss = tf.reduce_mean(per_sample_loss)
        # NOTE: the original author left a "BUG" banner here next to a
        # disabled alternative based on
        # tf.contrib.gan.losses.wargs.mutual_information_penalty.

        optimizer = tf.train.AdamOptimizer()
        self.train_op = optimizer.minimize(self.loss)

        self.loss_name = ["guidance_loss"]
        var_list = self.get_trainable_variables()
        feeds = [self.agent_s, self.agent_a, self.expert_a]
        fetches = [self.loss] + [U.flatgrad(self.loss, var_list)]
        self.lossandgrad = U.function(feeds, fetches)
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None,
             trainable_variance=True, init_logstd=0, clip=None, **tensors):
    """
    Build the action distribution, a log-likelihood gradient function and the
    value (or Q) head on top of the given latent tensors.

    Parameters:
    ----------
    env                 RL environment

    observations        tensorflow placeholder in which the observations will be fed

    latent              latent state from which policy distribution parameters should be inferred

    estimate_q          if True, build a Q head with one output per action (requires a
                        gym.spaces.Discrete action space); otherwise build a scalar value head

    vf_latent           latent state from which value function should be inferred (if None, then latent is used)

    sess                tensorflow session to run calculations in (if None, default session is used)

    trainable_variance  forwarded to pdtype.pdfromlatent

    init_logstd         forwarded to pdtype.pdfromlatent

    clip                forwarded to pdtype.pdfromlatent

    **tensors           tensorflow tensors for additional attributes such as state or mask
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    vf_latent = vf_latent if vf_latent is not None else latent

    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    self.pdtype = make_pdtype(env.action_space)

    self.pd, self.pi = self.pdtype.pdfromlatent(latent,
                                                init_scale=0.01,
                                                trainable_variance=trainable_variance,
                                                init_logstd=init_logstd,
                                                clip=clip)

    # Runtime switch between sampling from the distribution and taking its mode.
    self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
    self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
    self.neglogp = self.pd.neglogp(self.action)
    # Despite its name, `logits` holds softmax probabilities, same as `prob`.
    self.logits = tf.nn.softmax(self.pd.flatparam())
    self.sess = sess
    self.prob = tf.nn.softmax(self.pd.flatparam())

    # Prefer the nested "pi/pi" scope and fall back to "pi" when it is empty.
    self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi/pi")
    if len(self.vars) == 0:
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="pi")
    self.set_from_flat = tf_util.SetFromFlat(self.vars)

    # Choose the log-likelihood branch from whether the action space exposes
    # `n`. This replaces a bare `try/except` that used the AttributeError on
    # `.n` as the switch: it swallowed every other failure as well and left a
    # stray 'targets_placeholder' in the graph before falling back.
    num_actions = getattr(env.action_space, 'n', None)
    if num_actions is not None:
        self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
        self.action_selected = action_selected = tf.one_hot(self.action_ph, num_actions)
        # Mean log-probability of the fed actions under the softmax policy.
        out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob * action_selected, axis=1)))
        gradients = tf.gradients(out, self.vars)
    else:
        self.action_ph = tf.placeholder(dtype=tf.float32,
                                        shape=(None,) + env.action_space.shape,
                                        name='targets_placeholder')
        gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)

    # `compute_gradients` is only defined when the first variable receives a gradient.
    if gradients[0] is not None:
        flat_grad = tf_util.GetFlat(gradients).op
        self.compute_gradients = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[flat_grad]
        )

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
    else:
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]