def learn(*,
          network,
          env,
          total_timesteps,
          timesteps_per_batch=1024,  # what to train on
          max_kl=0.001,
          cg_iters=10,
          gamma=0.99, lam=1.0,  # advantage estimation
          seed=None,
          ent_coef=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_episodes=0, max_iters=0,  # time constraint
          callback=None,
          load_path=None,
          **network_kwargs
          ):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via tensorflow_code-pytorch.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for adam optimizer used to optimize value function loss

    vf_iters                number of value function optimization steps per policy optimization step

    total_timesteps         max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
                            and arguments to a particular type of network

    Returns:
    -------

    learnt model
    '''
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker
    ))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback:
            callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()

    return pi
class DDPG(object):
    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + (action_shape,), name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

        self.saver = tf.train.Saver()

    def setup_target_network_updates(self):
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [var for var in self.critic.trainable_vars
                               if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            # critic_reg = tc.layers.apply_regularization(
            #     tc.layers.l2_regularizer(self.critic_l2_reg),
            #     weights_list=critic_reg_vars
            # )
            # Equivalent to the commented tc.layers version above: compute the
            # actual L2 penalty over the selected kernels. Adding the bare
            # coefficient (a constant) to the loss has no effect on gradients.
            critic_reg = self.critic_l2_reg * tf.add_n(
                [tf.nn.l2_loss(var) for var in critic_reg_vars])
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        if self.param_noise:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, sigma, apply_noise=True, compute_Q=True):
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf

        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise(sigma)
            # assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        reward *= self.reward_scale

        # B = obs0.shape[0]
        # for b in range(B):
        #     self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
        #     if self.normalize_observations:
        #         self.obs_rms.update(np.array([obs0[b]]))
        self.memory.append(obs0, action, reward, obs1, terminal1)

    def train(self):
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run(
            #     [self.target_Q, self.ret_rms.mean, self.ret_rms.std],
            #     feed_dict={
            #         self.obs1: batch['obs1'],
            #         self.rewards: batch['rewards'],
            #         self.terminals1: batch['terminals1'].astype('float32'),
            #     })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def train_fake_data(self, batch):
        # Same update as `train`, but on a caller-provided batch instead of
        # one sampled from memory.
        # batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    def store(self, path):
        # Do not overwrite self.saver with the returned checkpoint path,
        # otherwise subsequent calls to store() would fail.
        self.saver.save(self.sess, path)

    def restore(self, sess, path, name):
        self.saver = tf.train.import_meta_graph(path + name + '.meta')
        self.saver.restore(sess, tf.train.latest_checkpoint(path))
        self.sess = sess
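
# Illustrative usage sketch (not part of the original class). A minimal
# interaction/training loop for the DDPG agent above, assuming a Gym-style
# `env`, an `agent` that has already been set up via `initialize(sess)` and
# `reset()`, and a noise scale `sigma` as consumed by `step`. The names and
# the schedule (one gradient step per environment step) are hypothetical.
def _example_ddpg_loop(agent, env, nb_steps=1000, sigma=0.1):
    obs = env.reset()
    for _ in range(nb_steps):
        # Act with exploration noise, then store the transition.
        action, q, _, _ = agent.step(obs, sigma, apply_noise=True, compute_Q=True)
        new_obs, reward, done, _ = env.step(action)
        agent.store_transition(obs, action, reward, new_obs, done)
        obs = new_obs
        if done:
            agent.reset()
            obs = env.reset()
        # One synced gradient update plus a soft target-network update.
        critic_loss, actor_loss = agent.train()
        agent.update_target_net()
    return agent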