def __init__(self, epsilon=1e-4, shape=(), scope=''): sess = get_session() self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64) self._new_var = tf.placeholder(shape=shape, dtype=tf.float64) self._new_count = tf.placeholder(shape=(), dtype=tf.float64) with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self._mean = tf.get_variable('mean', initializer=np.zeros( shape, 'float64'), dtype=tf.float64) self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64) self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64) self.update_ops = tf.group([ self._var.assign(self._new_var), self._mean.assign(self._new_mean), self._count.assign(self._new_count) ]) sess.run(tf.variables_initializer([self._mean, self._var, self._count])) self.sess = sess self._set_mean_var_count()
def get_input_layer(dim_input, dim_output): """produce the placeholder inputs that are used to run ops forward and backwards. net_input: usually an observation. action: mu, the ground truth actions we're trying to learn. precision: precision matrix used to commpute loss.""" net_input = tf.placeholder("float", [None, dim_input], name='nn_input') action = tf.placeholder('float', [None, dim_output], name='action') precision = tf.placeholder('float', [None, dim_output, dim_output], name='precision') return net_input, action, precision
def test_function(): with tf.Graph().as_default(): x = tf.placeholder(tf.int32, (), name="x") y = tf.placeholder(tf.int32, (), name="y") z = 3 * x + 2 * y lin = function([x, y], z, givens={y: 0}) with single_threaded_session(): initialize() assert lin(2) == 6 assert lin(2, 2) == 10
def test_multikwargs(): with tf.Graph().as_default(): x = tf.placeholder(tf.int32, (), name="x") with tf.variable_scope("other"): x2 = tf.placeholder(tf.int32, (), name="x") z = 3 * x + 2 * x2 lin = function([x, x2], z, givens={x2: 0}) with single_threaded_session(): initialize() assert lin(2) == 6 assert lin(2, 2) == 10
def __init__(self, dU, obs_tensor, act_op, feat_op, var, sess, device_string, copy_param_scope=None): Policy.__init__(self) self.dU = dU self.obs_tensor = obs_tensor self.act_op = act_op self.feat_op = feat_op self.sess = sess self.device_string = device_string self.chol_pol_covar = np.diag(np.sqrt(var)) self.scale = None # must be set from elsewhere based on observations self.bias = None self.x_idx = None if copy_param_scope: self.copy_params = tf.get_collection(tf.GraphKeys.VARIABLES, scope=copy_param_scope) self.copy_params_assign_placeholders = [ tf.placeholder(tf.float32, shape=param.get_shape()) for param in self.copy_params ] self.copy_params_assign_ops = [ tf.assign(self.copy_params[i], self.copy_params_assign_placeholders[i]) for i in range(len(self.copy_params)) ]
def _init(self, ob_space, ac_space, kind): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) x = ob / 255.0 if kind == 'small': # from A3C paper x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) x = U.flattenallbut0(x) x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) elif kind == 'large': # Nature DQN x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) x = U.flattenallbut0(x) x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) else: raise NotImplementedError logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(logits) self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = self.pd.sample() # XXX self._act = U.function([stochastic, ob], [ac, self.vpred])
def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std') new_std = self.ret_rms.std self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean') new_mean = self.ret_rms.mean self.renormalize_Q_outputs_op = [] for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)] self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]
def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1, critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Inputs. self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0') self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1') self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1') self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions = tf.placeholder(tf.float32, shape=(None,) + (action_shape,), name='actions') self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target') self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev') # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.critic = critic self.actor = actor self.actor_lr = actor_lr self.critic_lr = critic_lr self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg # Observation normalization. if self.normalize_observations: with tf.variable_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) # Return normalization. if self.normalize_returns: with tf.variable_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. target_actor = copy(actor) target_actor.name = 'target_actor' self.target_actor = target_actor target_critic = copy(critic) target_critic.name = 'target_critic' self.target_critic = target_critic # Create networks and core TF parts that are shared across setup parts. self.actor_tf = actor(normalized_obs0) self.normalized_critic_tf = critic(normalized_obs0, self.actions) self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True) self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms) self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1 # Set up parts. if self.param_noise is not None: self.setup_param_noise(normalized_obs0) self.setup_actor_optimizer() self.setup_critic_optimizer() if self.normalize_returns and self.enable_popart: self.setup_popart() self.setup_stats() self.setup_target_network_updates() self.initial_state = None # recurrent architectures not supported yet self.saver = tf.train.Saver()
def sample_placeholder(self, prepend_shape, name=None): return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape + self.sample_shape(), name=name)
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(nbatch_train, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via tensorflow_code-pytorch.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() return pi
def __init__(self, size, eps=1e-2, default_clip_range=np.inf, sess=None): """A normalizer that ensures that observations are approximately distributed according to a standard Normal distribution (i.e. have mean zero and variance one). Args: size (int): the size of the observation to be normalized eps (float): a small constant that avoids underflows default_clip_range (float): normalized observations are clipped to be in [-default_clip_range, default_clip_range] sess (object): the TensorFlow session to be used """ self.size = size self.eps = eps self.default_clip_range = default_clip_range self.sess = sess if sess is not None else tf.get_default_session() self.local_sum = np.zeros(self.size, np.float32) self.local_sumsq = np.zeros(self.size, np.float32) self.local_count = np.zeros(1, np.float32) self.sum_tf = tf.get_variable(initializer=tf.zeros_initializer(), shape=self.local_sum.shape, name='sum', trainable=False, dtype=tf.float32) self.sumsq_tf = tf.get_variable(initializer=tf.zeros_initializer(), shape=self.local_sumsq.shape, name='sumsq', trainable=False, dtype=tf.float32) self.count_tf = tf.get_variable(initializer=tf.ones_initializer(), shape=self.local_count.shape, name='count', trainable=False, dtype=tf.float32) self.mean = tf.get_variable(initializer=tf.zeros_initializer(), shape=(self.size, ), name='mean', trainable=False, dtype=tf.float32) self.std = tf.get_variable(initializer=tf.ones_initializer(), shape=(self.size, ), name='std', trainable=False, dtype=tf.float32) self.count_pl = tf.placeholder(name='count_pl', shape=(1, ), dtype=tf.float32) self.sum_pl = tf.placeholder(name='sum_pl', shape=(self.size, ), dtype=tf.float32) self.sumsq_pl = tf.placeholder(name='sumsq_pl', shape=(self.size, ), dtype=tf.float32) self.update_op = tf.group(self.count_tf.assign_add(self.count_pl), self.sum_tf.assign_add(self.sum_pl), self.sumsq_tf.assign_add(self.sumsq_pl)) self.recompute_op = tf.group( tf.assign(self.mean, self.sum_tf / self.count_tf), tf.assign( self.std, tf.sqrt( tf.maximum( tf.square(self.eps), self.sumsq_tf / self.count_tf - tf.square(self.sum_tf / self.count_tf)))), ) self.lock = threading.Lock()
def __init__(self, input_dims, buffer_size, hidden, layers, network_class, polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals, relative_goals, clip_pos_returns, clip_return, bc_loss, q_filter, num_demo, demo_batch_size, prm_loss_weight, aux_loss_weight, sample_transitions, gamma, reuse=False, **kwargs): """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER). Added functionality to use demonstrations for training to Overcome exploration problem. Args: input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the actions (u) buffer_size (int): number of transitions that are stored in the replay buffer hidden (int): number of units in the hidden layers layers (int): number of hidden layers network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic') polyak (float): coefficient for Polyak-averaging of the target network batch_size (int): batch size for training Q_lr (float): learning rate for the Q (critic) network pi_lr (float): learning rate for the pi (actor) network norm_eps (float): a small value used in the normalizer to avoid numerical instabilities norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip] max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u] action_l2 (float): coefficient for L2 penalty on the actions clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs] scope (str): the scope used for the TensorFlow graph T (int): the time horizon for rollouts rollout_batch_size (int): number of parallel rollouts per DDPG agent subtract_goals (function): function that subtracts goals from each other relative_goals (boolean): whether or not relative goals should be fed into the network clip_pos_returns (boolean): whether or not positive returns should be clipped clip_return (float): clip returns to be in [-clip_return, clip_return] sample_transitions (function) function that samples from the replay buffer gamma (float): gamma used for Q learning updates reuse (boolean): whether or not the networks should be reused bc_loss: whether or not the behavior cloning loss should be used as an auxilliary loss q_filter: whether or not a filter on the q value update should be used when training with demonstartions num_demo: Number of episodes in to be used in the demonstration buffer demo_batch_size: number of samples to be used from the demonstrations buffer, per mpi thread prm_loss_weight: Weight corresponding to the primary loss aux_loss_weight: Weight corresponding to the auxilliary loss also called the cloning loss """ if self.clip_return is None: self.clip_return = np.inf self.create_actor_critic = import_function(self.network_class) input_shapes = dims_to_shapes(self.input_dims) self.dimo = self.input_dims['o'] self.dimg = self.input_dims['g'] self.dimu = self.input_dims['u'] # Prepare staging area for feeding data to the model. stage_shapes = OrderedDict() for key in sorted(self.input_dims.keys()): if key.startswith('info_'): continue stage_shapes[key] = (None, *input_shapes[key]) for key in ['o', 'g']: stage_shapes[key + '_2'] = stage_shapes[key] stage_shapes['r'] = (None,) self.stage_shapes = stage_shapes # Create network. with tf.variable_scope(self.scope): self.staging_tf = StagingArea( dtypes=[tf.float32 for _ in self.stage_shapes.keys()], shapes=list(self.stage_shapes.values())) self.buffer_ph_tf = [ tf.placeholder(tf.float32, shape=shape) for shape in self.stage_shapes.values()] self.stage_op = self.staging_tf.put(self.buffer_ph_tf) self._create_network(reuse=reuse) # Configure the replay buffer. buffer_shapes = {key: (self.T if key != 'o' else self.T+1, *input_shapes[key]) for key, val in input_shapes.items()} buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg) buffer_shapes['ag'] = (self.T+1, self.dimg) buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) global demoBuffer demoBuffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions) #initialize the demo buffer; in the same way as the primary data buffer
def param_placeholder(self, prepend_shape, name=None): return tf.placeholder(dtype=tf.float32, shape=prepend_shape + self.param_shape(), name=name)
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() return pi
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])