def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p, q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
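# Hedged usage sketch for the check above. It assumes the pd types and tf_util
# module from baselines.common; the particular parameter vectors are illustrative
# only, not part of the function as written.
import numpy as np
import baselines.common.tf_util as U
from baselines.common.distributions import CategoricalPdType, DiagGaussianPdType

def test_probtypes_sketch():
    np.random.seed(0)
    with U.single_threaded_session():
        # Diagonal Gaussian: first half of pdparam is the mean, second half the log-std.
        pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8])
        validate_probtype(DiagGaussianPdType(pdparam_diag_gauss.size // 2), pdparam_diag_gauss)
        # Categorical: pdparam holds the unnormalized logits.
        pdparam_categorical = np.array([-.2, .3, .5])
        validate_probtype(CategoricalPdType(pdparam_categorical.size), pdparam_categorical)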
def __init__(self, ob_dim, ac_dim): # Here we'll construct a bunch of expressions, which will be used in two places: # (1) When sampling actions # (2) When computing loss functions, for the policy update # Variables specific to (1) have the word "sampled" in them, # whereas variables specific to (2) have the word "old" in them ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate wd_dict = {} h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output self.wd_dict = wd_dict self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs logstd_1a = tf.expand_dims(logstd_1a, 0) std_1a = tf.exp(logstd_1a) std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy self.compute_kl = U.function([ob_no, oldac_dist], kl) self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss U.initialize() # Initialize uninitialized TF variables
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    self.input_shape = tuple([o + a for o, a in zip(self.observation_shape, self.actions_shape)])
    self.num_actions = env.action_space.shape[0]
    self.hidden_size = hidden_size
    self.build_ph()
    # Build graph
    generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
    # Build accuracy
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
    # Build regression loss
    # let x = logits, z = targets.
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits))
    generator_loss = tf.reduce_mean(generator_loss)
    expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits))
    expert_loss = tf.reduce_mean(expert_loss)
    # Build entropy loss
    logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
    entropy_loss = -entcoeff * entropy
    # Loss + Accuracy terms
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss
    # Build Reward for policy
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
    var_list = self.get_trainable_variables()
    self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
                                  self.losses + [U.flatgrad(self.total_loss, var_list)])
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    obscaled = ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, epsilon=1e-2, shape=()):
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum", trainable=False)
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq", trainable=False)
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count", trainable=False)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))

    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
        updates=[tf.assign_add(self._sum, newsum),
                 tf.assign_add(self._sumsq, newsumsq),
                 tf.assign_add(self._count, newcount)])
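# Hedged sketch of how a batch of rows would be folded into the running statistics
# above. The standalone helper below is an assumption (it mirrors what callers of
# incfiltparams typically do), not a method of the class as written.
import numpy as np

def rms_update_sketch(rms, x):
    """Accumulate a batch `x` (shape [batch, *rms.shape]) into a RunningMeanStd `rms`."""
    x = np.asarray(x, dtype='float64')
    rms.incfiltparams(x.sum(axis=0),              # batch sum
                      np.square(x).sum(axis=0),   # batch sum of squares
                      np.array(len(x), dtype='float64'))  # batch count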
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
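# Hedged usage sketch for the policy built above. The single-observation call
# convention shown here is an assumption based on how self._act is constructed
# (stochastic flag first, then a batch of observations):
#   stochastic = True
#   ac, vpred = pi._act(stochastic, ob[None])   # add a leading batch dimension of 1
#   env.step(ac[0])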
def create_update_target(self):
    q_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope="q_func")
    target_q_func_vars = tf.get_collection(tf.GraphKeys.VARIABLES, scope="tar_q_func")
    update_target_expr = []
    for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                               sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(var_target.assign(var))
    update_target_expr = tf.group(*update_target_expr)
    update_target = U.function([], [], updates=[update_target_expr])
    return update_target
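# Typical call pattern (a sketch; the training-loop variable names such as
# `target_network_update_freq` are assumptions, not defined above):
#   update_target = agent.create_update_target()
#   update_target()                          # hard copy once after initialization
#   for t in itertools.count():
#       ...
#       if t % target_network_update_freq == 0:
#           update_target()                  # periodically sync the target network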
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                          outputs=output_actions,
                          givens={update_eps_ph: -1.0, stochastic_ph: True},
                          updates=[update_eps_expr])

        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps)
        return act
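# Hedged usage sketch. The observation-input wrapper, `q_model`, and the
# `exploration` schedule are assumptions about the caller; build_act itself only
# needs make_obs_ph, q_func, and num_actions:
#   act = build_act(lambda name: ObservationInput(env.observation_space, name=name),
#                   q_func=q_model, num_actions=env.action_space.n)
#   U.initialize()
#   action = act(obs[None], stochastic=True, update_eps=exploration.value(t))[0]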
def test_function():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        y = tf.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2 = tf.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2
        lin = function([x, x2], z, givens={x2: 0})

        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()

        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    losslist_ref = []
    for i in range(10):
        l = do_update()
        print(i, l)
        losslist_ref.append(l)

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
        losslist_test.append(l)

    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
def __init__(self, ob_dim, ac_dim):  # pylint: disable=W0613
    X = tf.placeholder(tf.float32, shape=[None, ob_dim*2 + ac_dim*2 + 2])  # batch of observations
    vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
    wd_dict = {}
    h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
    vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:, 0]
    sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
    wd_loss = tf.get_collection("vf_losses", None)
    loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
    loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
    self._predict = U.function([X], vpred_n)
    optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9,
                               clip_kl=0.3, epsilon=0.1, stats_decay=0.95,
                               async=1, kfac_update=2, cold_iter=50,  # NOTE: `async` is a reserved word in Python 3.7+; newer kfac versions rename this argument
                               weight_decay_dict=wd_dict, max_grad_norm=None)
    vf_var_list = []
    for var in tf.trainable_variables():
        if "vf" in var.name:
            vf_var_list.append(var)
    update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
    self.do_update = U.function([X, vtarg_n], update_op)  # pylint: disable=E1101
    U.initialize()  # Initialize uninitialized TF variables
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2
    lin = function([x, x2], z, givens={x2: 0})

    with single_threaded_session():
        initialize()

        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None,
          verbose=False):

    val_per_iter = int(max_iters/10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
def learn(env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes 
and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular()
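# Hedged sketch of the GAE(lambda) computation that add_vtarg_and_adv is expected to
# perform on a rollout segment before the update above. Field names follow the seg
# dict used in learn(); `new` and `nextvpred` are assumed to be provided by
# traj_segment_generator. Treat this as an illustration, not the canonical code.
import numpy as np

def add_vtarg_and_adv_sketch(seg, gamma, lam):
    new = np.append(seg["new"], 0)                     # 1 marks the start of a new episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap value for the last step
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, 'float32')
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]        # lambda-return targets for the value function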
def build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, scope="deepq", reuse=None): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that take a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. """ with tf.variable_scope(scope, reuse=reuse): observations_ph = U.ensure_tf_input(make_obs_ph("observation")) stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") if use_rp: latten_obs = tf.reshape(observations_ph.get(), [-1, input_dim]) rp = tf.random.normal([input_dim, hash_dim], 0, 1 / np.sqrt(hash_dim)) obs_hash_output = tf.matmul(latten_obs, rp) else: obs_hash_output, _ = model_func(observations_ph.get(), num_actions, scope="hash_func", reuse=False) eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) q_values = q_func(observations_ph.get(), num_actions, scope="q_func") deterministic_actions = tf.argmax(q_values, axis=1) batch_size = tf.shape(observations_ph.get())[0] random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign( tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) act = U.function( inputs=[observations_ph, stochastic_ph, update_eps_ph], outputs=[output_actions, obs_hash_output], givens={ update_eps_ph: -1.0, stochastic_ph: True }, updates=[update_eps_expr]) return act
def __init__(self, ob_dim, ac_dim): # Here we'll construct a bunch of expressions, which will be used in two places: # (1) When sampling actions # (2) When computing loss functions, for the policy update # Variables specific to (1) have the word "sampled" in them, # whereas variables specific to (2) have the word "old" in them ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations oldac_na = tf.placeholder( tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions oldac_dist = tf.placeholder( tf.float32, shape=[None, ac_dim * 2], name="oldac_dist" ) # batch of actions previous action distributions adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate oldlogprob_n = tf.placeholder( tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions wd_dict = {} h1 = tf.nn.tanh( dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) h2 = tf.nn.tanh( dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output self.wd_dict = wd_dict self.logstd_1a = logstd_1a = tf.get_variable( "logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs logstd_1a = tf.expand_dims(logstd_1a, 0) std_1a = tf.exp(logstd_1a) std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) ac_dist = tf.concat([ tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim]) ], 1) sampled_ac_na = tf.random_normal( tf.shape(ac_dist[:, ac_dim:]) ) * ac_dist[:, ac_dim:] + ac_dist[:, : ac_dim] # This is the sampled action we'll perform. logprobsampled_n = -U.sum(tf.log( ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 2.0 * np.pi) * ac_dim - 0.5 * U.sum( tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1) # Logprob of sampled action logprob_n = -U.sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 2.0 * np.pi ) * ac_dim - 0.5 * U.sum( tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1 ) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim)) #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n surr = -U.mean( adv_n * logprob_n ) # Loss function that we'll differentiate to get the policy gradient surr_sampled = -U.mean(logprob_n) # Sampled loss of the policy self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n ]) # Generate a new action and its logprob #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy self.compute_kl = U.function([ob_no, oldac_dist], kl) self.update_info = ( (ob_no, oldac_na, adv_n), surr, surr_sampled ) # Input and output variables needed for computing loss U.initialize() # Initialize uninitialized TF variables
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
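# Hedged sketch of how the four callables returned by build_train are typically
# wired together (the replay-buffer API and schedule names are assumptions about
# the caller, not defined above):
#   act_f, train, update_target, debug = build_train(make_obs_ph, q_func,
#                                                    num_actions, optimizer)
#   U.initialize()
#   update_target()                                    # sync target network once
#   ...
#   obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
#   td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
#                     np.ones_like(rewards))           # uniform importance weights
#   if t % target_network_update_freq == 0:
#       update_target()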
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None): """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that take a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable function to select and action given observation. ` See the top of the file for details. """ if param_noise_filter_func is None: param_noise_filter_func = default_param_noise_filter with tf.variable_scope(scope, reuse=reuse): observations_ph = make_obs_ph("observation") stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold") update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale") reset_ph = tf.placeholder(tf.bool, (), name="reset") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False) param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False) # Unmodified Q. q_values = q_func(observations_ph.get(), num_actions, scope="q_func") # Perturbable Q used for the actual rollout. q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func") # We have to wrap this code into a function due to the way tf.cond() works. See # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for # a more detailed discussion. def perturb_vars(original_scope, perturbed_scope): all_vars = scope_vars(absolute_scope_name(original_scope)) all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope)) assert len(all_vars) == len(all_perturbed_vars) perturb_ops = [] for var, perturbed_var in zip(all_vars, all_perturbed_vars): if param_noise_filter_func(perturbed_var): # Perturb this variable. op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale)) else: # Do not perturb, just assign. op = tf.assign(perturbed_var, var) perturb_ops.append(op) assert len(perturb_ops) == len(all_vars) return tf.group(*perturb_ops) # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy # of the network and measures the effect of that perturbation in action space. If the perturbation # is too big, reduce scale of perturbation, otherwise increase. 
q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func") perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func") kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1) mean_kl = tf.reduce_mean(kl) def update_scale(): with tf.control_dependencies([perturb_for_adaption]): update_scale_expr = tf.cond(mean_kl < param_noise_threshold, lambda: param_noise_scale.assign(param_noise_scale * 1.01), lambda: param_noise_scale.assign(param_noise_scale / 1.01), ) return update_scale_expr # Functionality to update the threshold for parameter space noise. update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold)) # Put everything together. deterministic_actions = tf.argmax(q_values_perturbed, axis=1) batch_size = tf.shape(observations_ph.get())[0] random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) updates = [ update_eps_expr, tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])), tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), update_param_noise_threshold_expr, ] _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph], outputs=output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False}, updates=updates) def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1): return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale) return act
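# The adaptive rule above, restated outside the graph as a sketch
# (param_noise_scale and param_noise_threshold match the variables built above):
#   if mean_kl(Q, Q_adaptively_perturbed) < param_noise_threshold:
#       param_noise_scale *= 1.01    # perturbation barely changes the policy: grow it
#   else:
#       param_noise_scale /= 1.01    # perturbation changes the policy too much: shrink it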
def learn(env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=0, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none', capacity=10, warm_start=True): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment ob_space = env.observation_space ac_space = env.action_space # Creating the memory buffer memory = Memory(capacity=capacity, batch_size=n_episodes, horizon=horizon, ob_space=ob_space, ac_space=ac_space) # Building the target policy and saving its parameters pi = make_policy('pi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split('/')[1].startswith('pol')] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Building a set of behavioral policies behavioral_policies = memory.build_policies(make_policy, pi) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([None], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(None), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='disc_rew') clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(None)) gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') active_policies = tf.placeholder(dtype=tf.float32, shape=(capacity), name='active_policies') losses_with_name = [] # Total number of trajectories N_total = tf.reduce_sum(active_policies) * n_episodes # Split operations disc_rew_split = tf.reshape(disc_rew_ * mask_, [-1, horizon]) rew_split = tf.reshape(rew_ * mask_, [-1, horizon]) mask_split = tf.reshape(mask_, [-1, horizon]) # Policy densities target_log_pdf = pi.pd.logp(ac_) * mask_ target_log_pdf_split = tf.reshape(target_log_pdf, [-1, horizon]) behavioral_log_pdfs = tf.stack([bpi.pd.logp(ac_) * mask_ for bpi in memory.policies]) # Shape is (capacity, ntraj*horizon) behavioral_log_pdfs_split = tf.reshape(behavioral_log_pdfs, [memory.capacity, -1, horizon]) reference_behavioral_log_pdf = memory.policies[0].pd.logp(ac_) reference_behavioral_log_pdf_split = tf.stack(tf.split(reference_behavioral_log_pdf * mask_, n_episodes)) reference_log_ratio = target_log_pdf - reference_behavioral_log_pdf reference_log_ratio_split = tf.stack(tf.split(reference_log_ratio * mask_, n_episodes)) # Compute renyi divergencies and sum over time, then exponentiate emp_d2_split = tf.reshape(tf.stack([pi.pd.renyi(bpi.pd, 2) * mask_ for bpi in memory.policies]), [memory.capacity, -1, horizon]) emp_d2_split_cum = tf.exp(tf.reduce_sum(emp_d2_split, axis=2)) # Compute arithmetic and harmonic mean of emp_d2 emp_d2_mean = tf.reduce_mean(emp_d2_split_cum, axis=1) emp_d2_arithmetic = tf.reduce_sum(emp_d2_mean * active_policies) / tf.reduce_sum(active_policies) emp_d2_harmonic = tf.reduce_sum(active_policies) / tf.reduce_sum(1 / emp_d2_mean) # Renyi divergence reference_emp_d2_split = 
tf.stack(tf.split(pi.pd.renyi(memory.policies[0].pd, 2) * mask_, n_episodes)) reference_emp_d2_cum_split = tf.reduce_sum(reference_emp_d2_split, axis=1) reference_empirical_d2 = tf.reduce_mean(tf.exp(reference_emp_d2_cum_split)) # Return processing: clipping, centering, discounting ep_return = clustered_rew_ #tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, horizon)] # Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf # Reward statistics return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), (return_std, 'InitialReturnStd'), (emp_d2_arithmetic, 'EmpiricalD2Arithmetic'), (emp_d2_harmonic, 'EmpiricalD2Harmonic'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'is': # Sum the log prob over time. Shapes: target(Nep, H), behav (Cap, Nep, H) target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdfs_split, axis=2) # To avoid numerical instability, compute the inversed ratio log_inverse_ratio = behavioral_log_pdf_episode - target_log_pdf_episode iw = 1 / tf.reduce_sum(tf.exp(log_inverse_ratio) * tf.expand_dims(active_policies, -1), axis=0) # Compute also the balance-heuristic weights iw_split = tf.reshape(iw, (memory.capacity, -1)) iw_by_behavioral = tf.reduce_mean(iw_split, axis=1) losses_with_name.append((iw_by_behavioral[0] / tf.reduce_sum(iw_by_behavioral), 'MultiIWFirstRatio')) losses_with_name.append((tf.reduce_max(iw_by_behavioral), 'MultiIWMax')) losses_with_name.append((tf.reduce_sum(iw_by_behavioral), 'MultiIWSum')) losses_with_name.append((tf.reduce_min(iw_by_behavioral), 'MultiIWMin')) # Get the probability by exponentiation #target_pdf_episode = tf.exp(target_log_pdf_episode) #behavioral_pdf_episode = tf.exp(behavioral_log_pdf_episode) # Get the denominator by averaging over behavioral policies #behavioral_pdf_mixture = tf.reduce_mean(behavioral_pdf_episode, axis=0) + 1e-24 #iw = target_pdf_episode / behavioral_pdf_mixture iwn = iw / n_episodes # Compute the J _w_return_mean = tf.reduce_sum(ep_return * iwn) # Empirical D2 of the mixture and relative ESS ess_renyi_arithmetic = N_total / emp_d2_arithmetic ess_renyi_harmonic = N_total / emp_d2_harmonic # Log quantities losses_with_name.extend([(tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (tf.reduce_min(target_log_pdf_episode), 'MinTargetPdf'), (tf.reduce_min(behavioral_log_pdf_episode), 'MinBehavPdf'), (ess_renyi_arithmetic, 'ESSRenyiArithmetic'), (ess_renyi_harmonic, 'ESSRenyiHarmonic')]) reference_iw = 
tf.exp(tf.reduce_sum(reference_log_ratio_split, axis=1)) reference_iwn = reference_iw / n_episodes w_return_mean = tf.reduce_sum(reference_iwn * ep_return) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'max-d2-harmonic': _bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi_harmonic)) * return_abs_max # TMP ess_renyi = n_episodes / reference_empirical_d2 bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi)) * return_abs_max elif bound == 'max-d2-arithmetic': bound_ = w_return_mean - tf.sqrt((1 - delta) / (delta * ess_renyi_arithmetic)) * return_abs_max else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme == 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum(0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ = bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp(-tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) ''' if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) ''' assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [bound_, assert_ops, print_ops]) compute_losses = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [log_inverse_ratio, abc, iw]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) policy_reinit = tf.variables_initializer(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=gamma) sampler = type("SequentialSampler", (object,), {"collect": lambda self, _: seg_gen.__next__()})() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) 
rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finished...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # Adding batch of trajectories to memory memory.add_trajectory_batch(seg) # Get multiple batches from memory seg_with_memory = memory.get_trajectories() # Get clustered reward reward_matrix = np.reshape(seg_with_memory['disc_rew'] * seg_with_memory['mask'], (-1, horizon)) ep_reward = np.sum(reward_matrix, axis=1) ep_reward = cluster_rewards(ep_reward, reward_clustering) args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number, active_policies = (seg_with_memory['ob'], seg_with_memory['ac'], seg_with_memory['rew'], seg_with_memory['disc_rew'], ep_reward, seg_with_memory['mask'], iters_so_far, memory.get_active_policies_mask()) def evaluate_loss(): loss = compute_bound(*args) return loss[0] def evaluate_gradient(): gradient = compute_grad(*args) return gradient[0] if use_natural_gradient: def evaluate_fisher_vector_prod(x): return compute_linear_operator(x, *args)[0] + fisher_reg * x def evaluate_natural_gradient(g): return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0) else: evaluate_natural_gradient = None with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("InitialBound", evaluate_loss()) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if save_weights > 0 and iters_so_far % save_weights == 0: logger.record_tabular('Weights', str(get_parameter())) import pickle file = open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') pickle.dump(theta, file) if not warm_start or memory.get_current_load() == capacity: # Optimize with timed("offline optimization"): theta, improvement = optimize_offline(theta, set_parameter, line_search, evaluate_loss, evaluate_gradient, evaluate_natural_gradient, max_offline_ite=max_offline_iters) set_parameter(theta) with timed('summaries after'): meanlosses = np.array(compute_losses(*args)) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) else: # Reinitialize the policy tf.get_default_session().run(policy_reinit) logger.dump_tabular() env.close()
def learn(env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # 
predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular()
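add_vtarg_and_adv is called above but defined elsewhere. For readability, here is a reference sketch of the usual GAE(lambda) computation that matches the seg fields used here (new, vpred, nextvpred, rew, adv, tdlamret); it is not taken verbatim from this file:

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # Generalized Advantage Estimation: adv[t] = sum_l (gamma*lam)^l * delta[t+l]
    new = np.append(seg["new"], 0)                     # episode-start flags, padded by one
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap with the value after the batch
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    rew = seg["rew"]
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]        # lambda-return targets for the value function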
def learn( env, policy_func, disc, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) logdir=".", agentName="PPO-Agent", resume=0, num_parallel=0, num_cpu=1, num_extra=0, gan_batch_size=128, gan_num_epochs=5, gan_display_step=40, resume_disc=0, resume_non_disc=0, mocap_path="", gan_replay_buffer_size=1000000, gan_prob_to_put_in_replay=0.01, gan_reward_to_retrain_discriminator=5, use_distance=0, use_blend=0): # Deal with GAN if not use_distance: replay_buf = MyReplayBuffer(gan_replay_buffer_size) data = np.loadtxt( mocap_path + ".dat" ) #"D:/p4sw/devrel/libdev/flex/dev/rbd/data/bvh/motion_simple.dat"); label = np.concatenate((np.ones( (data.shape[0], 1)), np.zeros((data.shape[0], 1))), axis=1) print("Real data label = " + str(label)) mocap_set = Dataset(dict(data=data, label=label), shuffle=True) # Setup losses and stuff # ---------------------------------------- rank = MPI.COMM_WORLD.Get_rank() ob_space = env.observation_space ac_space = env.action_space ob_size = ob_space.shape[0] ac_size = ac_space.shape[0] #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape)) #exit(0) pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vfloss1 = tf.square(pi.vpred - ret) vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param) vfloss2 = tf.square(vpredclipped - ret) vf_loss = .5 * U.mean( tf.maximum(vfloss1, vfloss2) ) # we do the same clipping-based trust region for the value function #vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # 
---------------------------------------- sess = tf.get_default_session() avars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) non_disc_vars = [ a for a in avars if not a.name.split("/")[0].startswith("discriminator") ] disc_vars = [ a for a in avars if a.name.split("/")[0].startswith("discriminator") ] #print(str(non_disc_names)) #print(str(disc_names)) #exit(0) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards disc_saver = tf.train.Saver(disc_vars, max_to_keep=None) non_disc_saver = tf.train.Saver(non_disc_vars, max_to_keep=None) saver = tf.train.Saver(max_to_keep=None) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) if not use_distance: if os.path.exists(logdir + "\\" + 'replay_buf_' + str(int(resume / 100) * 100) + '.pkl'): print("Load replay buf") with open( logdir + "\\" + 'replay_buf_' + str(int(resume / 100) * 100) + '.pkl', 'rb') as f: replay_buf = pickle.load(f) else: print("Can't load replay buf " + logdir + "\\" + 'replay_buf_' + str(int(resume / 100) * 100) + '.pkl') iters_so_far = resume if resume_non_disc > 0: non_disc_saver.restore( tf.get_default_session(), os.path.join( os.path.abspath(logdir), "{}-{}".format(agentName + "_non_disc", resume_non_disc))) iters_so_far = resume_non_disc if use_distance: print("Use distance") nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(data) else: nn = None seg_gen = traj_segment_generator(pi, env, disc, timesteps_per_batch, stochastic=True, num_parallel=num_parallel, num_cpu=num_cpu, rank=rank, ob_size=ob_size, ac_size=ac_size, com=MPI.COMM_WORLD, num_extra=num_extra, iters_so_far=iters_so_far, use_distance=use_distance, nn=nn) if resume_disc > 0: disc_saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName + "_disc", resume_disc))) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" logF = open(logdir + "\\" + 'log.txt', 'a') logR = open(logdir + "\\" + 'log_rew.txt', 'a') logStats = open(logdir + "\\" + 'log_stats.txt', 'a') if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'): with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'rb') as f: ob_list = pickle.load(f) else: ob_list = [] dump_training = 0 learn_from_training = 0 if dump_training: # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std saverRMS = tf.train.Saver({ "_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count }) saverRMS.save(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf")) ob_np_a = np.asarray(ob_list) ob_np = np.reshape(ob_np_a, (-1, ob_size)) [vpred, pdparam] = pi._vpred_pdparam(ob_np) print("vpred = " + str(vpred)) print("pd_param = " + str(pdparam)) with open('training.pkl', 'wb') as f: pickle.dump(ob_np, f) pickle.dump(vpred, f) pickle.dump(pdparam, f) exit(0) if learn_from_training: # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std with open('training.pkl', 'rb') as f: ob_np = pickle.load(f) vpred = pickle.load(f) pdparam = pickle.load(f) num = ob_np.shape[0] for i in range(num): xp = ob_np[i][1] ob_np[i][1] = 0.0 ob_np[i][18] -= xp ob_np[i][22] -= xp ob_np[i][24] -= xp ob_np[i][26] -= xp ob_np[i][28] -= xp ob_np[i][30] -= xp ob_np[i][32] -= xp ob_np[i][34] -= xp print("ob_np = " + str(ob_np)) print("vpred = " + 
str(vpred)) print("pdparam = " + str(pdparam)) batch_size = 128 y_vpred = tf.placeholder(tf.float32, [ batch_size, ]) y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]]) vpred_loss = U.mean(tf.square(pi.vpred - y_vpred)) vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam)) total_train_loss = vpred_loss + vpdparam_loss #total_train_loss = vpdparam_loss #total_train_loss = vpred_loss #coef = 0.01 #dense_all = U.dense_all #for a in dense_all: # total_train_loss += coef * tf.nn.l2_loss(a) #total_train_loss = vpdparam_loss optimizer = tf.train.AdamOptimizer( learning_rate=0.001).minimize(total_train_loss) d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam), shuffle=not pi.recurrent) sess = tf.get_default_session() sess.run(tf.global_variables_initializer()) saverRMS = tf.train.Saver({ "_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count }) saverRMS.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf")) if resume > 0: saver.restore( tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume))) for q in range(100): sumLoss = 0 for batch in d.iterate_once(batch_size): tl, _ = sess.run( [total_train_loss, optimizer], feed_dict={ pi.ob: batch["ob"], y_vpred: batch["vpred"], y_pdparam: batch["pdparam"] }) sumLoss += tl print("Iteration " + str(q) + " Loss = " + str(sumLoss)) assign_old_eq_new() # set old parameter values to new parameter values # Save as frame 1 try: saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=1) except: pass #exit(0) if resume > 0: firstTime = False else: firstTime = True # Check accuracy #amocap = sess.run([disc.accuracy], # feed_dict={disc.input: data, # disc.label: label}) #print("Mocap accuracy = " + str(amocap)) #print("Mocap label is " + str(label)) #adata = np.array(replay_buf._storage) #print("adata shape = " + str(adata.shape)) #alabel = np.concatenate((np.zeros((adata.shape[0], 1)), np.ones((adata.shape[0], 1))), axis=1) #areplay = sess.run([disc.accuracy], # feed_dict={disc.input: adata, # disc.label: alabel}) #print("Replay accuracy = " + str(areplay)) #print("Replay label is " + str(alabel)) #exit(0) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel, num_cpu) #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"])) #exit(0) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret, extra = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"], seg["extra"] #ob_list.append(ob.tolist()) vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, 
atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) #print(str(losses)) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) rewmean = np.mean(rewbuffer) logger.record_tabular("EpRewMean", rewmean) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) # Train discriminator if not use_distance: print("Put in replay buf " + str((int)(gan_prob_to_put_in_replay * extra.shape[0] + 1))) replay_buf.add(extra[np.random.choice( extra.shape[0], (int)(gan_prob_to_put_in_replay * extra.shape[0] + 1), replace=True)]) #if iters_so_far == 1: if not use_blend: if firstTime: firstTime = False # Train with everything we got lb = np.concatenate((np.zeros( (extra.shape[0], 1)), np.ones((extra.shape[0], 1))), axis=1) extra_set = Dataset(dict(data=extra, label=lb), shuffle=True) for e in range(10): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): batch = extra_set.next_batch(gan_batch_size) _, l = sess.run( [disc.optimizer_first, disc.loss], feed_dict={ disc.input: np.concatenate( (mbatch['data'], batch['data'])), disc.label: np.concatenate( (mbatch['label'], batch['label'])) }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator: for e in range(gan_num_epochs): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): data = replay_buf.sample(mbatch['data'].shape[0]) lb = np.concatenate((np.zeros( (data.shape[0], 1)), np.ones( (data.shape[0], 1))), axis=1) _, l = sess.run( [disc.optimizer, disc.loss], feed_dict={ disc.input: np.concatenate((mbatch['data'], data)), disc.label: np.concatenate((mbatch['label'], lb)) }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, 
i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) else: if firstTime: firstTime = False # Train with everything we got extra_set = Dataset(dict(data=extra), shuffle=True) for e in range(10): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): batch = extra_set.next_batch(gan_batch_size) bf = np.random.uniform(0, 1, (gan_batch_size, 1)) onembf = 1 - bf my_label = np.concatenate((bf, onembf), axis=1) my_data = np.multiply(mbatch['data'], bf) + np.multiply( batch['data'], onembf) _, l = sess.run([disc.optimizer_first, disc.loss], feed_dict={ disc.input: my_data, disc.label: my_label }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator: for e in range(gan_num_epochs): i = 0 for mbatch in mocap_set.iterate_once(gan_batch_size): data = replay_buf.sample(mbatch['data'].shape[0]) bf = np.random.uniform(0, 1, (gan_batch_size, 1)) onembf = 1 - bf my_label = np.concatenate((bf, onembf), axis=1) my_data = np.multiply(mbatch['data'], bf) + np.multiply( data, onembf) _, l = sess.run([disc.optimizer_first, disc.loss], feed_dict={ disc.input: my_data, disc.label: my_label }) i = i + 1 # Display logs per step if i % gan_display_step == 0 or i == 1: print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) print( 'discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) # if True: # lb = np.concatenate((np.zeros((extra.shape[0],1)),np.ones((extra.shape[0],1))),axis=1) # extra_set = Dataset(dict(data=extra,label=lb), shuffle=True) # num_r = 1 # if iters_so_far == 1: # num_r = gan_num_epochs # for e in range(num_r): # i = 0 # for batch in extra_set.iterate_once(gan_batch_size): # mbatch = mocap_set.next_batch(gan_batch_size) # _, l = sess.run([disc.optimizer, disc.loss], feed_dict={disc.input: np.concatenate((mbatch['data'],batch['data'])), disc.label: np.concatenate((mbatch['label'],batch['label']))}) # i = i + 1 # # Display logs per step # if i % gan_display_step == 0 or i == 1: # print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) # print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l)) if not use_distance: if iters_so_far % 100 == 0: with open( logdir + "\\" + 'replay_buf_' + str(iters_so_far) + '.pkl', 'wb') as f: pickle.dump(replay_buf, f) with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'wb') as f: pickle.dump(ob_list, f) if MPI.COMM_WORLD.Get_rank() == 0: logF.write(str(rewmean) + "\n") logR.write(str(seg['mean_ext_rew']) + "\n") logStats.write(logger.get_str() + "\n") logF.flush() logStats.flush() logR.flush() logger.dump_tabular() try: os.remove(logdir + "/checkpoint") except OSError: pass try: saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far) except: pass try: non_disc_saver.save(tf.get_default_session(), os.path.join(logdir, agentName + "_non_disc"), global_step=iters_so_far) except: pass try: disc_saver.save(tf.get_default_session(), os.path.join(logdir, agentName + "_disc"), global_step=iters_so_far) except: pass
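MyReplayBuffer is used throughout the discriminator training above but not defined in this section. A minimal sketch of the interface actually exercised (add a batch of feature rows, sample with replacement, picklable), under the assumption that it is a simple bounded FIFO buffer backed by a _storage list:

import numpy as np

class MyReplayBuffer(object):
    # Hypothetical sketch of the buffer interface used above.
    def __init__(self, maxlen):
        self._maxlen = maxlen
        self._storage = []

    def add(self, rows):
        # Append each row, evicting the oldest entries once the buffer is full.
        for row in np.atleast_2d(rows):
            if len(self._storage) >= self._maxlen:
                self._storage.pop(0)
            self._storage.append(row)

    def sample(self, n):
        # Uniform sampling with replacement.
        idx = np.random.randint(0, len(self._storage), size=n)
        return np.asarray([self._storage[i] for i in idx])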
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) entcoeff coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = 
U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) print('=================') print(np.mean(vtargs)) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
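ZFilter(env.observation_space.shape) whitens observations before they reach the policy; it is not defined in this section. A minimal sketch, assuming it keeps a running mean and standard deviation and standardizes each incoming observation when called:

import numpy as np

class ZFilter(object):
    # Hypothetical sketch: running standardization x -> (x - mean) / (std + eps),
    # with statistics updated online via Welford's algorithm.
    def __init__(self, shape, eps=1e-8):
        self.eps = eps
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)

    def __call__(self, x, update=True):
        x = np.asarray(x, dtype=np.float64)
        if update:
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1))
        return (x - self.mean) / (std + self.eps)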
def run_hoof_no_lamgam( network, env, total_timesteps, timesteps_per_batch, # what to train on kl_range, gamma_range, lam_range, # advantage estimation num_kl, num_gamma_lam, cg_iters=10, seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' MPI = None nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space # +2 for gamma, lambda ob = tf.placeholder(shape=(None, env.observation_space.shape[0] + 2), dtype=env.observation_space.dtype, name='Ob') with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = 
tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_ratio = U.function( [ob, ac, atarg], ratio) # IS ratio - used for computing IS weights compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator_with_gl(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' kl_range = np.atleast_1d(kl_range) gamma_range = np.atleast_1d(gamma_range) lam_range = np.atleast_1d(lam_range) while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() thbefore = get_flat() rand_gamma = gamma_range[0] + ( gamma_range[-1] - gamma_range[0]) * np.random.rand(num_gamma_lam) rand_lam = lam_range[0] + ( lam_range[-1] - lam_range[0]) * np.random.rand(num_gamma_lam) rand_kl = kl_range[0] + (kl_range[-1] - kl_range[0]) * np.random.rand(num_kl) opt_polval = -10**8 est_polval = np.zeros((num_gamma_lam, num_kl)) ob_lam_gam = [] tdlamret = [] vpred = [] for gl in range(num_gamma_lam): oblg, vpredbefore, atarg, tdlr = add_vtarg_and_adv_without_gl( pi, seg, rand_gamma[gl], rand_lam[gl]) ob_lam_gam += [oblg] tdlamret += [tdlr] vpred += [vpredbefore] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate pol_ob = np.concatenate( (seg['ob'], np.zeros(seg['ob'].shape[:-1] + (2, ))), axis=-1) args = pol_ob, seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) 
+ cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=False) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) surrbefore = lossbefore[0] for m, kl in enumerate(rand_kl): lm = np.sqrt(shs / kl) fullstep = stepdir / lm thnew = thbefore + fullstep set_from_flat(thnew) # compute the IS estimates lik_ratio = compute_ratio(*args) est_polval[gl, m] = wis_estimate(seg, lik_ratio) # update best policy found so far if est_polval[gl, m] > opt_polval: opt_polval = est_polval[gl, m] opt_th = thnew opt_kl = kl opt_gamma = rand_gamma[gl] opt_lam = rand_lam[gl] opt_vpredbefore = vpredbefore opt_tdlr = tdlr meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore expectedimprove = g.dot(fullstep) set_from_flat(thbefore) logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) set_from_flat(opt_th) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) ob_lam_gam = np.concatenate(ob_lam_gam, axis=0) tdlamret = np.concatenate(tdlamret, axis=0) vpred = np.concatenate(vpred, axis=0) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (ob_lam_gam, tdlamret), include_final_partial_batch=False, batch_size=num_gamma_lam * 64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpred, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Opt_KL", opt_kl) logger.record_tabular("gamma", opt_gamma) logger.record_tabular("lam", opt_lam) if rank == 0: logger.dump_tabular() return pi
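Each candidate (gamma, lambda, KL) setting above is scored with wis_estimate, which is not shown here. The following is a heavily hedged sketch of a weighted importance sampling estimator consistent with how it is called: per-episode weights are products of the per-step likelihood ratios, with episode boundaries taken from seg["new"] and undiscounted returns from seg["ep_rets"]; the actual estimator in the source may differ.

import numpy as np

def wis_estimate(seg, lik_ratio):
    # Hypothetical sketch: V_WIS = sum_i(w_i * R_i) / sum_i(w_i)
    starts = np.nonzero(seg["new"])[0].tolist() + [len(lik_ratio)]
    weights = np.array([np.prod(lik_ratio[s:e]) for s, e in zip(starts[:-1], starts[1:])])
    returns = np.asarray(seg["ep_rets"][:len(weights)])
    return np.sum(weights * returns) / (np.sum(weights) + 1e-8)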
def learn(env, policy_fn, *, timesteps_per_actorbatch, max_timesteps = 0, max_episodes = 0, max_iters = 0, max_seconds = 0, seed, env_id, params): timestr = strftime("%Y-%m-%d %H:%M:%S", gmtime()) global ALPHA global INIT_ALPHA global POP_SIZE global SELECTED_SIZE global action_clip global INIT_MUTATE global action_clip ALPHA = params[0] INIT_ALPHA = ALPHA POP_SIZE = params[1] SELECTED_SIZE = params[2] envID = params[3] solved_score = params[4] action_clip = params[5] RAND_MUT_POWER = params[6] INIT_MUTATE = RAND_MUT_POWER EVAL_ITERS = params[7] degrade = params[8] logger.log("GAR: "+"Degrade: "+str(degrade)+", Alpha: "+str(ALPHA)+", Pop Size: "+str(POP_SIZE)+", Sel Size: "+str(SELECTED_SIZE)+", " "Env: "+envID+", Solved at: "+str(solved_score)+", AC Clip: "+str(action_clip)+", Mutate: "+str(RAND_MUT_POWER)+", Evals: "+str(EVAL_ITERS)) #Env is the enviroment the player acts in ob_space = env.observation_space ac_space = env.action_space #Policy is our player pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy ob = U.get_placeholder_cached(name="ob") import numpy as np np.random.seed(seed) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # traj return - baseline ac = pi.pdtype.sample_placeholder([None]) #Action placeholder reinforce_loss = tf.reduce_sum(pi.pd.neglogp(ac) * ret) #loss var_list = pi.get_trainable_variables() global get_gradient get_gradient = U.function([ob, ac, ret], [U.flatgrad(reinforce_loss, var_list)]) U.initialize() set_from_flat = U.SetFromFlat(pi.get_trainable_variables()) global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() stochastic = False lenbuffer = deque(maxlen = 100) # rolling buffer for episode lengths rewbuffer = deque(maxlen = 100) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" flatten_weights = pi.get_Flat_variables()() IND_SIZE = len(flatten_weights) eval_iterations = EVAL_ITERS # change to 5 or 3 best_solution = flatten_weights best_fitness = 0 prev_population = np.ndarray(shape=[POP_SIZE], dtype=np.ndarray) np.random.seed(seed) population = np.random.randn(POP_SIZE, IND_SIZE) prev_fitnesses = np.ndarray(shape=[POP_SIZE]) fitnesses = np.ndarray(shape=[POP_SIZE]) timeout = time.time() + 180000 # 5 Days for g in range(0, GENERATIONS): for i in range(0, POP_SIZE): if g == 0: results = es_eval(env, env_id, pi, population[i], best_solution, eval_iterations, stochastic, timesteps_per_actorbatch, seed, best_fitness, id, set_from_flat) # A new ind is produced via the REINFORCE algorithm population[i] = reinforce(results, population[i], env) # Update individual with directed local search based evolution fitnesses[i] = results[1] logger.log("Ind: " + str(i) + ", with fitness: " + str(fitnesses[i]) +", Alpha: "+str(ALPHA)+", at time " + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " generation: " + str(g)) else: if i == 0: # Leaves the first element in place fittest survives population[i] = prev_population[i] fitnesses[i] = prev_fitnesses[i] logger.log("Ind: " + str(i) + ", with fitness: " + str(fitnesses[i]) + ", at time " + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " generation: " + str(g)) else: #Evaluate a given ind for trajectories and fitness parent = prev_population[random.randint(0, SELECTED_SIZE)] + np.random.normal(loc=0, scale=RAND_MUT_POWER, size=IND_SIZE) results = es_eval(env, env_id, pi, parent, 
best_solution, eval_iterations, stochastic, timesteps_per_actorbatch, seed, best_fitness, id, set_from_flat) #A new ind is produced via the REINFORCE algorithm population[i] = reinforce(results, parent, env) #Update individual with directed local search based evolution fitnesses[i] = results[1] logger.log("Ind: " + str(i) + ", with fitness: " + str(fitnesses[i]) + ", Alpha: " + str( ALPHA) + ", at time " + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " generation: " + str(g)) # Sort this generation by fitness descending order sorted_inds = fitnesses.argsort()[::-1] fitnesses = fitnesses[sorted_inds] population = population[sorted_inds] best_fitness = fitnesses[0] best_solution = population[0] set_from_flat(best_solution) if time.time() > timeout: logger.log("Ran out of time, best so far:") logger.log("Best Fitness: " + str(fitnesses[0]) + ", at time " + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " generation: " + str(g)) break if best_fitness >= solved_score: logger.log("Best Fitness: " + str(fitnesses[0]) + ", at time " + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " generation: " + str(g)) break else: logger.log("Best Fitness: "+str(fitnesses[0])+", at time "+ strftime("%Y-%m-%d %H:%M:%S", gmtime())+" generation: "+str(g)) prev_fitnesses = np.copy(fitnesses) prev_population = np.copy(population) logger.log("Best Ind Weights <") for v in best_solution: logger.log(str(v)) logger.log(">")
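Each individual is nudged by reinforce(results, individual, env), which uses the get_gradient function built at the top of learn. A hypothetical sketch of that step, assuming results packs the rollout tensors (observations, actions, returns) alongside the fitness that the caller reads out as results[1]:

import numpy as np

def reinforce(results, individual, env):
    # Hypothetical sketch of the "directed local search" step: one REINFORCE
    # gradient step on the flattened weights. Relies on the module-level
    # get_gradient and ALPHA set up in learn above.
    obs, acs, rets = results[0]                      # assumed rollout layout
    rets = np.asarray(rets) - np.mean(rets)          # baseline-subtracted returns
    grad = get_gradient(obs, acs, rets)[0]           # flat gradient of sum(neglogp * ret)
    return individual - ALPHA * grad                 # descend the REINFORCE loss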
def init_eta_omega(self, beta, epsilon, init_eta, init_omega): # Here we define the symbolic function for the dual and the gradient self.beta = beta self.epsilon = epsilon # Init dual param values self.param_eta = init_eta self.param_omega = init_omega self.param_eta_non_lin = init_eta self.param_omega_non_lin = init_omega param_eta = tf.placeholder(dtype=tf.float32, shape=[], name="param_eta") param_omega = tf.placeholder(dtype=tf.float32, shape=[], name="param_omega") old_entropy = tf.placeholder(dtype=tf.float32, shape=[], name="old_entropy") varphis = tf.placeholder(dtype=tf.float32, shape=[None, None], name="varphis") Kt = tf.placeholder(dtype=tf.float32, shape=[None, None], name="Kt") prec = tf.placeholder(dtype=tf.float32, shape=[None, None], name="prec") Waa = tf.placeholder(dtype=tf.float32, shape=[None, None], name="Waa") Wsa = tf.placeholder(dtype=tf.float32, shape=[None, None], name="Wsa") wa = tf.placeholder(dtype=tf.float32, shape=[None, None], name="wa") # varphis = ext.new_tensor( # 'varphis', # ndim=2, # dtype=theano.config.floatX # ) # Kt = ext.new_tensor( # 'Kt', # ndim=2, # dtype=theano.config.floatX # ) # prec = ext.new_tensor( # 'prec', # ndim=2, # dtype=theano.config.floatX # ) # Waa = ext.new_tensor( # 'Waa', # ndim=2, # dtype=theano.config.floatX # ) # Wsa = ext.new_tensor( # 'Wsa', # ndim=2, # dtype=theano.config.floatX # ) # wa = ext.new_tensor( # 'wa', # ndim=2, # dtype=theano.config.floatX # ) if self.beta == 0: beta = 0 else: beta = old_entropy - self.beta # beta = self.printt('beta shape: ', beta) # log_action_prob = self.printn('log_action_prob shape: ', log_action_prob) # action_prob = self.printn('action_prob shape: ', action_prob) # q_values = self.printn('q_values shape: ', q_values) # beta = self.printn('beta shape: ', beta) # ha(s): eta * (\varphi(s)^T * K^T * \Sigma^{-1} + W_{sa}) + wa(s)) ha = tf.matmul(varphis, param_eta * tf.matmul(Kt, prec) + Wsa) + wa # hss(s): eta * (\varphi(s)^T * K^T * \Sigma^{-1} * K * \varphi(s)) varphisKt = tf.matmul(varphis, Kt) hss = param_eta * tf.reduce_sum(tf.matmul(varphisKt, prec) * varphisKt, axis=1) Haa = param_eta * prec + Waa # Haa = 0.5 * (Haa + TT.transpose(Haa)) HaaInv = tf.matrix_inverse(Haa) # The two terms 'term1' and 'term2' which come from normalizers of the # 1. Original policy distribution # 2. The distribution after completing the square sigma = tf.matrix_inverse(prec) term1 = -0.5 * param_eta * tf.log( tf.matrix_determinant(2 * np.pi * sigma)) if self.beta == 0: term2 = 0.5 * param_eta * tf.log( tf.matrix_determinant(2 * np.pi * param_eta * HaaInv)) else: term2 = 0.5 * (param_eta + param_omega) * tf.log( tf.matrix_determinant(2 * np.pi * (param_eta + param_omega) * HaaInv)) dual = param_eta * self.epsilon - param_omega * beta + \ term1 + term2 + tf.reduce_mean( 0.5 * (tf.reduce_sum(tf.matmul(ha, HaaInv) * ha, axis=1) - hss)) # Symbolic dual gradient dual_grad = tf.gradients(xs=[param_eta, param_omega], ys=dual) # Eval functions. 
f_dual = U.function( inputs=[varphis, Kt, prec, Waa, Wsa, wa] + [param_eta, param_omega, old_entropy], outputs=dual, # mode='DebugMode' # TEST ) f_dual_grad = U.function( inputs=[varphis, Kt, prec, Waa, Wsa, wa] + [param_eta, param_omega, old_entropy], outputs=dual_grad, # mode='DebugMode' # TEST ) # # # TEST # d0 = param_eta * self.epsilon - param_omega * beta # d1 = term1 # d2 = term2 # d3 = TT.mean(0.5 * (TT.sum(TT.dot(ha, HaaInv) * ha, axis=1))) # d4 = TT.mean(hss) # f_duals = ext.compile_function( # inputs=[varphis, Kt, prec, Waa, Wsa, wa] + [param_eta, param_omega, old_entropy], # outputs=[d0, d1, d2, d3, d4] # ) # # END TEST self.opt_info = dict( f_dual=f_dual, f_dual_grad=f_dual_grad, # f_duals=f_duals, # TEST )
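f_dual and f_dual_grad are only constructed here; the actual fit of the dual variables happens elsewhere. A usage sketch under the assumption that (eta, omega) are optimized with a bounded quasi-Newton method, with the argument order matching the U.function inputs above:

import numpy as np
import scipy.optimize

def optimize_dual(f_dual, f_dual_grad, varphis, Kt, prec, Waa, Wsa, wa,
                  old_entropy, x0=(1.0, 1.0)):
    # Hypothetical sketch: minimize the dual over eta > 0, omega > 0.
    def fval(x):
        eta, omega = x
        return np.float64(f_dual(varphis, Kt, prec, Waa, Wsa, wa, eta, omega, old_entropy))

    def fgrad(x):
        eta, omega = x
        g = f_dual_grad(varphis, Kt, prec, Waa, Wsa, wa, eta, omega, old_entropy)
        return np.asarray(g, dtype=np.float64)

    res = scipy.optimize.minimize(fval, np.asarray(x0, dtype=np.float64), jac=fgrad,
                                  method="L-BFGS-B",
                                  bounds=[(1e-12, None), (1e-12, None)])
    return res.x  # fitted (eta, omega)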
def learn(env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- #seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: data_path = '/Users/wjh720/Desktop/Tmp/para_%i/' % (timesteps_per_actorbatch / 100) U.load_state(data_path + 'para') test(pi, env, timesteps_per_actorbatch, stochastic=True)
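This variant only restores saved parameters and calls test(pi, env, ...), which is not included in this section. A minimal sketch of what such an evaluation loop could look like, purely as an assumption about its behaviour:

import numpy as np

def test(pi, env, max_timesteps, stochastic=True):
    # Hypothetical sketch: roll the restored policy out and report episode returns.
    ep_rets, ep_ret, t = [], 0.0, 0
    ob = env.reset()
    while t < max_timesteps:
        ac, _ = pi.act(stochastic, ob)
        ob, rew, done, _ = env.step(ac)
        ep_ret += rew
        t += 1
        if done:
            ep_rets.append(ep_ret)
            ep_ret, ob = 0.0, env.reset()
    mean_ret = np.mean(ep_rets) if ep_rets else float("nan")
    print("Mean episode return over %d episodes: %.2f" % (len(ep_rets), mean_ret))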
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
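common.discount is used above both to turn reward sequences into discounted returns and to accumulate the GAE deltas. For context, a sketch of the usual lfilter-based implementation of a discounted cumulative sum (not copied from this file):

import numpy as np
import scipy.signal

def discount(x, gamma):
    # out[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ...
    x = np.asarray(x, dtype=np.float64)
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1], axis=0)[::-1]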
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={})) self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={})) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] # change for BC stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.ac = ac self._act = U.function([stochastic, ob], [ac, self.vpred])
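Policies built with _init above are normally driven through a thin wrapper around self._act that handles the batch dimension. A sketch of that usual companion (in the library it typically lives as a method on the same policy class; shown here as a standalone helper for clarity):

def act(pi, stochastic, ob):
    # _act expects a batch, so add a leading batch dimension for the single
    # observation and strip it from the sampled action and value prediction.
    ac1, vpred1 = pi._act(stochastic, ob[None])
    return ac1[0], vpred1[0]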
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) 
get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def learn(env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps = 0, max_episodes = 0, max_iters = 0, max_seconds = 0, # time constraint callback = None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon = 1e-5, rho = 0.95, update_step_threshold = 100, shift=0, schedule = 'constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy td_v_target = tf.placeholder(dtype = tf.float32, shape = [1, 1]) # V target lrmult = tf.placeholder(name = 'lrmult', dtype = tf.float32, shape = []) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name = "ob") ac = pi.pdtype.sample_placeholder([]) adv = tf.placeholder(dtype = tf.float32, shape = [1, 1]) ent = pi.pd.entropy() vf_loss = tf.reduce_mean(tf.square(pi.vpred - td_v_target)) vf_losses = [vf_loss] vf_loss_names = ["vf_loss"] pol_loss = -tf.reduce_mean(adv * pi.pd.logp(ac)) pol_losses = [pol_loss] pol_loss_names = ["pol_loss"] var_list = pi.get_trainable_variables() vf_var_list = [v for v in var_list if v.name.split("/")[1].startswith( "vf")] pol_var_list = [v for v in var_list if v.name.split("/")[1].startswith( "pol")] # Train V function vf_lossandgrad = U.function([ob, td_v_target, lrmult], vf_losses + [U.flatgrad(vf_loss, vf_var_list)]) vf_adam = MpiAdam(vf_var_list, epsilon = adam_epsilon) # vf_optimizer = tf.train.AdamOptimizer(learning_rate = lrmult, epsilon = adam_epsilon) # vf_train_op = vf_optimizer.minimize(vf_loss, vf_var_list) # Train Policy pol_lossandgrad = U.function([ob, ac, adv, lrmult, td_v_target], pol_losses + [U.flatgrad(pol_loss, pol_var_list)]) pol_adam = MpiAdam(pol_var_list, epsilon = adam_epsilon) # pol_optimizer = tf.train.AdamOptimizer(learning_rate = 0.1 * lrmult, epsilon = adam_epsilon) # pol_train_op = pol_optimizer.minimize(pol_loss, pol_var_list) # Computation compute_v_pred = U.function([ob], [pi.vpred]) # vf_update = U.function([ob, td_v_target], [vf_train_op]) # pol_update = U.function([ob, ac, adv], [pol_train_op]) U.initialize() vf_adam.sync() pol_adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic = False) global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, best_fitness episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen = 100) # rolling buffer for episode lengths rewbuffer = deque(maxlen = 100) # rolling buffer for episode rewards Transition = collections.namedtuple("Transition", ["ob", "ac", "reward", "next_ob", "done"]) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" normalizer = Normalizer(1) # Step learning, this loop now indicates episodes while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult 
= 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError # logger.log("********** Episode %i ************" % episodes_so_far) rac_alpha = optim_stepsize * cur_lrmult rac_beta = optim_stepsize * cur_lrmult * 0.1 # print("rac_alpha=", rac_alpha) # print("rac_beta=", rac_beta) if timesteps_so_far == 0: # result_record() seg = seg_gen.__next__() lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) result_record() ob = env.reset() # episode = [] cur_ep_ret = 0 # return in current episode cur_ep_len = 0 # len of current episode ep_rets = [] # returns of completed episodes in this segment ep_lens = [] # lengths of ... obs = [] t_0 = 0 pol_gradients = [] record = False for t in itertools.count(): ac, vpred = pi.act(stochastic = True, ob = ob) origin_ac = ac ac = np.clip(ac, ac_space.low, ac_space.high) obs.append(ob) next_ob, rew, done, _ = env.step(ac) if env.spec._env_name == "MountainCarContinuous": rew = rew - np.abs(next_ob[0] - env.unwrapped.goal_position) ac = origin_ac # rew = np.clip(rew, -1., 1.) # episode.append(Transition(ob=ob.reshape((1, ob.shape[0])), ac=ac.reshape((1, ac.shape[0])), reward=rew, next_ob=next_ob.reshape((1, ob.shape[0])), done=done)) original_rew = rew if env.spec._env_name != "InvertedPendulumBulletEnv": normalizer.update(rew) rew = normalizer.normalize(rew) cur_ep_ret += (original_rew - shift) cur_ep_len += 1 timesteps_so_far += 1 # Compute v target and TD v_target = rew + gamma * np.array(compute_v_pred(next_ob.reshape((1, ob.shape[0])))) adv = v_target - np.array(compute_v_pred(ob.reshape((1, ob.shape[0])))) # Update V and Update Policy vf_loss, vf_g = vf_lossandgrad(ob.reshape((1, ob.shape[0])), v_target, rac_alpha) # vf_g = adv * ob.reshape((1, ob.shape[0])) vf_adam.update(vf_g, rac_alpha) pol_loss, pol_g = pol_lossandgrad(ob.reshape((1, ob.shape[0])), ac, adv, rac_beta, v_target) pol_gradients.append(pol_g) # if t == update_step_threshold: if t % update_step_threshold == 0 and t > 0: scaling_factor = [rho ** (t - i) for i in range(t_0, t)] coef = update_step_threshold / np.sum(scaling_factor) sum_weighted_pol_gradients = np.sum( [scaling_factor[i] * pol_gradients[i] for i in range(len(scaling_factor))], axis = 0) pol_adam.update(coef * sum_weighted_pol_gradients, rac_beta) pol_gradients = [] t_0 = t ob = next_ob if timesteps_so_far % 10000 == 0: record = True if done: if len(pol_gradients) > 0: scaling_factor = [rho ** (t - i) for i in range(t_0, t)] coef = (t - t_0) / np.sum(scaling_factor) sum_weighted_pol_gradients = np.sum( [scaling_factor[i] * pol_gradients[i] for i in range(len(scaling_factor))], axis = 0) pol_adam.update(coef * sum_weighted_pol_gradients, rac_beta) pol_gradients = [] t_0 = 0 # print( # "Episode {} - Total reward = {}, Total Steps = {}".format(episodes_so_far, cur_ep_ret, cur_ep_len)) # ep_rets.append(cur_ep_ret) # returns of completed episodes in this segment # ep_lens.append(cur_ep_len) # lengths of .. 
# lenbuffer.append(cur_ep_len) # rewbuffer.append(cur_ep_ret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(np.array(obs)) # update running mean/std for normalization iters_so_far += 1 episodes_so_far += 1 ob = env.reset() if record: seg = seg_gen.__next__() lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) result_record() record = False break
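The inner loop above accumulates per-step policy gradients and flushes them every update_step_threshold steps (or at episode end), weighting each gradient by rho**(t - i) and rescaling so the weights sum to the window length. A small standalone NumPy sketch of that recency-weighted averaging (function name hypothetical):

import numpy as np

def weighted_gradient_sum(gradients, t_0, t, rho=0.95):
    # gradients[i] corresponds to step t_0 + i; more recent gradients get larger weight.
    weights = np.array([rho ** (t - i) for i in range(t_0, t)])
    coef = len(weights) / weights.sum()               # rescale so weights sum to the window length
    stacked = np.stack(gradients, axis=0)             # [window, param_dim]
    return coef * (weights[:, None] * stacked).sum(axis=0)

# e.g. five 4-dimensional gradients collected between steps 10 and 15:
grads = [np.random.randn(4) for _ in range(5)]
update = weighted_gradient_sum(grads, t_0=10, t=15, rho=0.95)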
def learn(args, env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) writer=None ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(name='atarg', dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(name='ret', dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # ob = U.get_placeholder_cached(name="ob") ob = {} ob['adj'] = U.get_placeholder_cached(name="adj") ob['node'] = U.get_placeholder_cached(name="node") # cond_ob = {} # cond_ob['adj'] = U.get_placeholder(shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='cond_adj') # cond_ob['node'] = U.get_placeholder(shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='cond_node') #cond_ob['ori_adj'] = tf.placeholder(shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='cond_adj') cond_smi_vec = U.get_placeholder(name='cond_smi', dtype=tf.float32, shape=[None, args.smi_max_length, len(env.smile_chars)]) cond_sample = U.get_placeholder(name='normal_cond_sample', dtype=tf.float32, shape=[None, 1, ob_space['node'].shape[1]]) # cond_mean = tf.placeholder(shape=[None, 1, None, args.emb_size], name='cond_mean', dtype=tf.float32) # cond_logstd = tf.placeholder(shape=[None, 1, None, args.emb_size], name='cond_logstd', dtype=tf.float32) ob_gen = {} ob_gen['adj'] = U.get_placeholder(shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_gen') ob_gen['node'] = U.get_placeholder(shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_gen') ob_real = {} ob_real['adj'] = U.get_placeholder(shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_real') ob_real['node'] = U.get_placeholder(shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_real') ob_sequence_real = {} ob_sequence_real['adj'] = U.get_placeholder(shape=[None, env.max_action, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_sequence_real') ob_sequence_real['node'] = U.get_placeholder(shape=[None, env.max_action, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_sequence_real') ac_sequence_real = U.get_placeholder(shape=[None, env.max_action, 4], dtype=tf.int64, name='ac_sequence_real') ac = tf.placeholder(dtype=tf.int64, shape=[None, 4], name='ac_real') if args.has_attention == 0: cond_mean, cond_logstd = pi.encoder(args, cond_smi_vec, ob_space['node'].shape[1]) kl_loss = tf.reduce_mean(-0.5 * tf.reduce_sum(tf.reduce_sum(1 + cond_logstd - tf.square(cond_mean) - tf.exp(cond_logstd), axis=2), axis=1)) else: kl_loss = tf.constant(0, dtype=tf.float32) ## PPO loss kloldnew = 
oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_logp = pi.pd.logp(ac) oldpi_logp = oldpi.pd.logp(ac) ratio_log = pi.pd.logp(ac) - oldpi.pd.logp(ac) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss #+ args.kl_ppo_ratio * kl_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] ## Expert loss #reconstruction_loss = -tf.reduce_mean(pi_logp) #+ args.kl_expert_ratio * kl_loss #print(pi.decoder(args, ob_sequence_real['adj'][:, 2, :, :, :], ob_sequence_real['node'][:, 2, :, :, :], pi.sample, ac_sequence_real[:, 2, :])[0].logp(ac_sequence_real[:, 2, :]).shape) # generate_cross_entropy = lambda cross_entropy_loss, idx: (tf.add(cross_entropy_loss, pi.decoder(args, ob_sequence_real['adj'][:, idx, :, :, :], ob_sequence_real['node'][:, idx, :, :, :], pi.sample, ob_space, ac_sequence_real[:, idx, :], env.atom_type_num)[0].logp(ac_sequence_real[:, idx, :])), tf.add(idx, 1)) # reconstruction_loss_total, final_idx = tf.while_loop(lambda cross_entropy_loss, idx: idx < env.max_action, generate_cross_entropy, (tf.zeros((tf.shape(ob_sequence_real['adj'])[0],), dtype=tf.float32), tf.constant(0))) # reconstruction_loss = -tf.reduce_mean(reconstruction_loss_total) # loss_expert = reconstruction_loss + args.kl_ratio * kl_loss if args.has_attention == 1: ori_loss_expert = -tf.reduce_mean(pi_logp) else: ori_loss_expert = -tf.reduce_mean(pi_logp) + args.kl_ratio * kl_loss ## Discriminator loss # loss_d_step, _, _ = discriminator(ob_real, ob_gen,args, name='d_step') # loss_d_gen_step,_ = discriminator_net(ob_gen,args, name='d_step') # loss_d_final, _, _ = discriminator(ob_real, ob_gen,args, name='d_final') # loss_d_gen_final,_ = discriminator_net(ob_gen,args, name='d_final') step_pred_real, step_logit_real = discriminator_net(ob_real, args, name='d_step') step_pred_gen, step_logit_gen = discriminator_net(ob_gen, args, name='d_step') loss_d_step_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=step_logit_real, labels=tf.ones_like(step_logit_real)*0.9)) loss_d_step_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) loss_d_step = loss_d_step_real + loss_d_step_gen if args.gan_type == 'normal': loss_g_step_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) loss_g_step_gen = loss_g_step_gen # + args.kl_g_ratio * kl_loss elif args.gan_type == 'recommend': loss_g_step_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=step_logit_gen, labels=tf.ones_like(step_logit_gen)*0.9)) loss_g_step_gen = loss_g_step_gen # + args.kl_g_ratio * kl_loss elif args.gan_type == 'wgan': loss_d_step, _, _ = discriminator(ob_real, ob_gen, args, name='d_step') loss_d_step = loss_d_step * -1 loss_g_step_gen, _ = discriminator_net(ob_gen, args, name='d_step') loss_g_step_gen = loss_g_step_gen # + args.kl_g_ratio * kl_loss final_pred_real, final_logit_real = discriminator_net(ob_real, args, name='d_final') final_pred_gen, final_logit_gen = 
discriminator_net(ob_gen, args, name='d_final') loss_d_final_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=final_logit_real, labels=tf.ones_like(final_logit_real)*0.9)) loss_d_final_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) loss_d_final = loss_d_final_real + loss_d_final_gen if args.gan_type == 'normal': loss_g_final_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) loss_g_final_gen = loss_g_final_gen elif args.gan_type == 'recommend': loss_g_final_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=final_logit_gen, labels=tf.ones_like(final_logit_gen)*0.9)) loss_g_final_gen = loss_g_final_gen elif args.gan_type == 'wgan': loss_d_final, _, _ = discriminator(ob_real, ob_gen, args, name='d_final') loss_d_final = loss_d_final * -1 loss_g_final_gen, _ = discriminator_net(ob_gen, args, name='d_final') loss_g_final_gen = loss_g_final_gen var_list_pi = pi.get_trainable_variables() var_list_pi_stop = [var for var in var_list_pi if ('emb' in var.name) or ('gcn' in var.name) or ('stop' in var.name)] #var_list_encoder = [var for var in tf.global_variables() if 'cond_encoder' in var.name] var_list_d_step = [var for var in tf.global_variables() if 'd_step' in var.name] var_list_d_final = [var for var in tf.global_variables() if 'd_final' in var.name] ## loss update function lossandgrad_ppo = U.function([ob['adj'], ob['node'], cond_smi_vec, cond_sample, ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list_pi)]) # lossandgrad_seq_expert = U.function([ob_sequence_real['adj'], ob_sequence_real['node'], cond_smi_vec, cond_sample, ac_sequence_real], [loss_expert, kl_loss, U.flatgrad(loss_expert, var_list_pi)]) lossandgrad_expert = U.function([ob['adj'], ob['node'], cond_smi_vec, cond_sample, ac, pi.ac_real], [ori_loss_expert, kl_loss, U.flatgrad(ori_loss_expert, var_list_pi)]) lossandgrad_attention_expert = U.function([ob['adj'], ob['node'], cond_smi_vec, ac, pi.ac_real], [ori_loss_expert, U.flatgrad(ori_loss_expert, var_list_pi)]) # lossandgrad_expert_stop = U.function([ob['adj'], ob['node'], cond_smi_vec, cond_sample, ac, pi.ac_real], [loss_expert, U.flatgrad(loss_expert, var_list_pi_stop)]) #lossandgrad_kl = U.function([cond_smi_vec], [kl_loss, U.flatgrad(kl_loss, var_list_encoder)]) lossandgrad_d_step = U.function([ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_step, U.flatgrad(loss_d_step, var_list_d_step)]) lossandgrad_d_final = U.function([ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_final, U.flatgrad(loss_d_final, var_list_d_final)]) loss_g_gen_step_func = U.function([ob_gen['adj'], ob_gen['node'], cond_smi_vec], loss_g_step_gen) loss_g_gen_final_func = U.function([ob_gen['adj'], ob_gen['node'], cond_smi_vec], loss_g_final_gen) adam_pi = MpiAdam(var_list_pi, epsilon=adam_epsilon) #adam_encoder = MpiAdam(var_list_encoder, epsilon=adam_epsilon) adam_pi_stop = MpiAdam(var_list_pi_stop, epsilon=adam_epsilon) adam_d_step = MpiAdam(var_list_d_step, epsilon=adam_epsilon) adam_d_final = MpiAdam(var_list_d_final, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) # # compute_losses_expert = U.function([ob['adj'], ob['node'], ac, pi.ac_real], # loss_expert) compute_losses = U.function([ob['adj'], ob['node'], 
cond_smi_vec, cond_sample, ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult], losses) # Prepare for rollouts # ---------------------------------------- episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths lenbuffer_valid = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_env = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_step = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final_stat = deque(maxlen=100) # rolling buffer for episode rewardsn #seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch, True, loss_g_gen_step_func, loss_g_gen_final_func) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch, True, loss_g_gen_step_func, loss_g_gen_final_func) U.initialize() if args.load == 1: try: fname = './ckpt/' + args.name_full + '_' + args.reward_type + '_'+str(args.has_cond)+'_' +str(args.rl_start)+'_'+ str(int(args.recons_ratio))+'_'+str(int(args.qed_ratio))+'_'+str(4800) # load sess = tf.get_default_session() # sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(var_list_pi) saver.restore(sess, fname) iters_so_far = int(fname.split('_')[-1])+1 print('model restored!', fname, 'iters_so_far:', iters_so_far) except: print(fname, 'ckpt not found, start with iters 0') #U.initialize() # adam_pi.sync() # adam_pi_stop.sync() # adam_d_step.sync() # adam_d_final.sync() # # counter = 0 # level = 0 ## start training if args.is_train == 1: print("======================Start training=====================") #U.initialize() adam_pi.sync() adam_pi_stop.sync() adam_d_step.sync() adam_d_final.sync() counter = 0 level = 0 batch_iterator = env.make_batch_iterator(args, optim_batchsize) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError # logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() # expert_seg = batch_iterator.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob_adj, ob_node, cond_smi_vec, normal_cond_sample, ac, atarg, tdlamret = seg["ob_adj"], seg["ob_node"], seg[ "cond_smi_vec"], seg["cond_sample"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob_adj=ob_adj, ob_node=ob_node, cond_smi_vec=cond_smi_vec, normal_cond_sample=normal_cond_sample, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob_adj.shape[0] # inner training loop, train policy for i_optim in range(optim_epochs): loss_expert = 0 loss_expert_stop = 0 expert_kl_loss = 0 
g_expert = 0 g_expert_stop = 0 expert_g_kl = 0 rl_kl_loss = 0 rl_g_kl = 0 loss_d_step = 0 loss_d_final = 0 loss_kl = 0 g_ppo = 0 g_d_step = 0 g_d_final = 0 pretrain_shift = 5 # batch = d.next_batch(optim_batchsize) # kl_loss, g_kl = lossandgrad_kl(batch["cond_smi_vec"]) # kl_loss = np.mean(kl_loss) # adam_encoder.update(g_kl, optim_stepsize * cur_lrmult) ## Expert if iters_so_far >= args.expert_start and iters_so_far <= args.expert_end + pretrain_shift: ## Expert train # ob_experts, ac_experts, ori_smis = env.get_seq_expert(optim_batchsize, args.samples_num) # ori_smi_vec = env.batch_smi2vec(args, ori_smis) # samples = np.random.randn(optim_batchsize, 1, ob_experts['node'].shape[-1]) # for k in range(args.samples_num): # print(k) # loss_expert, loss_kl, g_expert = lossandgrad_expert(ob_experts['adj'][:, k, :, :, :], ob_experts['node'][:, k, :, :, :], ori_smi_vec, samples, ac_experts[:, k, :], ac_experts[:, k, :]) # adam_pi.update(g_expert, optim_stepsize * cur_lrmult) ob_expert, ac_expert, ori_smi = env.get_ori_expert(optim_batchsize) ori_smi_vec = env.batch_smi2vec(args, ori_smi) if args.has_attention == 0: samples = np.random.randn(optim_batchsize, 1, ob_expert['node'].shape[-2]) loss_expert, loss_kl, g_expert = lossandgrad_expert(ob_expert['adj'], ob_expert['node'], ori_smi_vec, samples, ac_expert, ac_expert) else: loss_expert, g_expert = lossandgrad_attention_expert(ob_expert['adj'], ob_expert['node'], ori_smi_vec, ac_expert, ac_expert) # batch_data = np.random.choice(expert_seg, optim_batchsize) # batch_adj_trajs, batch_node_trajs, batch_ac_trajs, batch_smis = make_batch(batch_data) # batch_smis_vec = env.batch_smi2vec(args, batch_smis) # samples = np.random.randn(optim_batchsize, 1, batch_node_trajs.shape[-1]) # loss_expert, loss_kl, g_expert = lossandgrad_seq_expert(batch_adj_trajs, batch_node_trajs, batch_smis_vec, samples, batch_ac_trajs) ## PPO if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: assign_old_eq_new() # set old parameter values to new parameter values batch = d.next_batch(optim_batchsize) #rl_kl_loss, rl_g_kl = lossandgrad_kl(batch["cond_ob_adj"], batch["cond_ob_node"]) #rl_kl_loss = np.mean(rl_kl_loss) #adam_encoder.update(rl_g_kl, optim_stepsize * cur_lrmult) # ppo # if args.has_ppo==1: if iters_so_far >= args.rl_start+pretrain_shift: # start generator after discriminator trained a well.. 
*newlosses, g_ppo = lossandgrad_ppo(batch["ob_adj"], batch["ob_node"], batch["cond_smi_vec"], batch["normal_cond_sample"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses_ppo = newlosses if args.has_d_step == 1 and i_optim >= optim_epochs//2: # update step discriminator ob_expert, _, _ = env.get_ori_expert(optim_batchsize, curriculum=args.curriculum, level_total=args.curriculum_num, level=level) loss_d_step, g_d_step = lossandgrad_d_step(ob_expert["adj"], ob_expert["node"], batch["ob_adj"], batch["ob_node"]) adam_d_step.update(g_d_step, optim_stepsize * cur_lrmult) loss_d_step = np.mean(loss_d_step) if args.has_d_final == 1 and i_optim >= optim_epochs//4*3: # update final discriminator ob_expert, _, _ = env.get_ori_expert(optim_batchsize, is_final=True, curriculum=args.curriculum, level_total=args.curriculum_num, level=level) seg_final_adj, seg_final_node = traj_final_generator(args, pi, copy.deepcopy(env), optim_batchsize, True) # update final discriminator loss_d_final, g_d_final = lossandgrad_d_final(ob_expert["adj"], ob_expert["node"], seg_final_adj, seg_final_node) # loss_d_final, g_d_final = lossandgrad_d_final(ob_expert["adj"], ob_expert["node"], ob_adjs, ob_nodes) adam_d_final.update(g_d_final, optim_stepsize * cur_lrmult) # print(seg["ob_adj_final"].shape) # logger.log(fmt_row(13, np.mean(losses, axis=0))) # update generator # adam_pi_stop.update(0.1*g_expert_stop, optim_stepsize * cur_lrmult) # if g_expert==0: # adam_pi.update(g_ppo, optim_stepsize * cur_lrmult) # else: #adam_encoder.update(rl_g_kl + expert_g_kl, optim_stepsize * cur_lrmult) adam_pi.update(0.2*g_ppo+0.1*g_expert, optim_stepsize * cur_lrmult) loss_kl = np.mean(loss_kl) # WGAN # if args.has_d_step == 1: # clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_step] # if args.has_d_final == 1: # clip_D = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in var_list_d_final] # ## PPO val # if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: # logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): #print(batch["vtarg"].shape) newlosses = compute_losses(batch["ob_adj"], batch["ob_node"], batch["cond_smi_vec"], batch["normal_cond_sample"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) # logger.log(fmt_row(13, meanlosses)) #print(kl_loss) if writer is not None: writer.add_scalar("loss_expert", loss_expert, iters_so_far) writer.add_scalar("KL_loss", loss_kl, iters_so_far) writer.add_scalar("loss_expert_stop", loss_expert_stop, iters_so_far) # no use writer.add_scalar("loss_d_step", loss_d_step, iters_so_far) writer.add_scalar("loss_d_final", loss_d_final, iters_so_far) writer.add_scalar('grad_expert_min', np.amin(g_expert), iters_so_far) writer.add_scalar('grad_expert_max', np.amax(g_expert), iters_so_far) writer.add_scalar('grad_expert_norm', np.linalg.norm(g_expert), iters_so_far) writer.add_scalar('grad_expert_stop_min', np.amin(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_max', np.amax(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_norm', np.linalg.norm(g_expert_stop), iters_so_far) writer.add_scalar('grad_rl_min', np.amin(g_ppo), iters_so_far) writer.add_scalar('grad_rl_max', np.amax(g_ppo), iters_so_far) writer.add_scalar('grad_rl_norm', np.linalg.norm(g_ppo), iters_so_far) writer.add_scalar('g_d_step_min', np.amin(g_d_step), iters_so_far) 
writer.add_scalar('g_d_step_max', np.amax(g_d_step), iters_so_far) writer.add_scalar('g_d_step_norm', np.linalg.norm(g_d_step), iters_so_far) writer.add_scalar('g_d_final_min', np.amin(g_d_final), iters_so_far) writer.add_scalar('g_d_final_max', np.amax(g_d_final), iters_so_far) writer.add_scalar('g_d_final_norm', np.linalg.norm(g_d_final), iters_so_far) writer.add_scalar('learning_rate', optim_stepsize * cur_lrmult, iters_so_far) for (lossval, name) in zipsame(meanlosses, loss_names): # logger.record_tabular("loss_"+name, lossval) if writer is not None: writer.add_scalar("loss_"+name, lossval, iters_so_far) # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) if writer is not None: writer.add_scalar("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), iters_so_far) # ??? lrlocal = (seg["ep_lens"],seg["ep_lens_valid"], seg["ep_rets"], seg["ep_rets_env"],seg["ep_rets_d_step"],seg["ep_rets_d_final"],seg["ep_final_rew"],seg["ep_final_rew_stat"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, lens_valid, rews, rews_env, rews_d_step,rews_d_final, rews_final,rews_final_stat = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) lenbuffer_valid.extend(lens_valid) rewbuffer.extend(rews) rewbuffer_d_step.extend(rews_d_step) rewbuffer_d_final.extend(rews_d_final) rewbuffer_env.extend(rews_env) rewbuffer_final.extend(rews_final) rewbuffer_final_stat.extend(rews_final_stat) # logger.record_tabular("EpLenMean", np.mean(lenbuffer)) # logger.record_tabular("EpRewMean", np.mean(rewbuffer)) # logger.record_tabular("EpThisIter", len(lens)) if writer is not None: writer.add_scalar("EpLenMean", np.mean(lenbuffer), iters_so_far) writer.add_scalar("EpLenValidMean", np.mean(lenbuffer_valid), iters_so_far) writer.add_scalar("EpRewMean", np.mean(rewbuffer), iters_so_far) writer.add_scalar("EpRewDStepMean", np.mean(rewbuffer_d_step), iters_so_far) writer.add_scalar("EpRewDFinalMean", np.mean(rewbuffer_d_final), iters_so_far) writer.add_scalar("EpRewEnvMean", np.mean(rewbuffer_env), iters_so_far) writer.add_scalar("EpRewFinalMean", np.mean(rewbuffer_final), iters_so_far) writer.add_scalar("EpRewFinalStatMean", np.mean(rewbuffer_final_stat), iters_so_far) writer.add_scalar("EpThisIter", len(lens), iters_so_far) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # logger.record_tabular("EpisodesSoFar", episodes_so_far) # logger.record_tabular("TimestepsSoFar", timesteps_so_far) # logger.record_tabular("TimeElapsed", time.time() - tstart) if writer is not None: writer.add_scalar("EpisodesSoFar", episodes_so_far, iters_so_far) writer.add_scalar("TimestepsSoFar", timesteps_so_far, iters_so_far) writer.add_scalar("TimeElapsed", time.time() - tstart, iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: with open('molecule_gen/' + args.name_full +'_'+args.reward_type+'_'+str(args.smi_importance)+'.csv', 'a') as f: f.write('***** Iteration {} *****\n'.format(iters_so_far)) # save if iters_so_far % args.save_every == 0: fname = './ckpt/' + args.name_full + '_' + args.reward_type + '_'+str(args.has_cond)+'_' + str(args.rl_start)+'_'+str(args.recons_ratio)+'_'+str(args.qed_ratio)+'_'+str(iters_so_far) saver = tf.train.Saver(var_list_pi) saver.save(tf.get_default_session(), fname) print('model saved!', fname) # fname = os.path.join(ckpt_dir, task_name) # os.makedirs(os.path.dirname(fname), exist_ok=True) # saver = tf.train.Saver() # saver.save(tf.get_default_session(), fname) # if iters_so_far==args.load_step: iters_so_far += 1 
counter += 1 if counter % args.curriculum_step and counter // args.curriculum_step < args.curriculum_num: level += 1 else: print("=======================================Start generating=========================================") while True: seg = seg_gen.__next__()
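The PPO part of the objective above is the standard clipped surrogate: the probability ratio between the new and old policies is clipped to [1 - clip_param, 1 + clip_param] and the pessimistic minimum is taken. A minimal NumPy version, for illustration only:

import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)                              # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv                                              # unclipped surrogate
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv # clipped surrogate
    return -np.mean(np.minimum(surr1, surr2))                        # pessimistic bound, negated for minimization

# e.g. a batch of 3 transitions:
loss = ppo_clip_loss(np.array([-1.0, -0.5, -2.0]),
                     np.array([-1.1, -0.7, -1.8]),
                     np.array([0.3, -0.2, 1.0]))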
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select an action given an observation. See the top of the file for details. """ # epsilon-greedy policy: pick either a random action or the current best action with tf.variable_scope(scope, reuse=reuse): observations_ph = make_obs_ph("observation") stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") # the epsilon parameter eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) # forward pass through the network to compute the Q-value of each action q_values = q_func(observations_ph.get(), num_actions, scope="q_func") deterministic_actions = tf.argmax(q_values, axis=1) batch_size = tf.shape(observations_ph.get())[0] # tf.random_uniform(): draw random values from a uniform distribution; the first argument is the shape, # here tf.stack([batch_size]) evaluates to [batch_size], # so random_actions looks like [5, 1, 3, 4, ...] with batch_size entries random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) # epsilon policy: compute a boolean mask deciding which rows explore chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps # tf.where() : Return the elements, either from `x` or `y`, depending on the `condition`. # the output should be taken from `x` (if true) or `y` (if false). stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) # Return `true_fn()` if the predicate `pred` is true else `false_fn()`: stochastic_ph selects between the stochastic and deterministic actions output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign( tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = U.function( inputs=[observations_ph, stochastic_ph, update_eps_ph], outputs=output_actions, givens={ update_eps_ph: -1.0, stochastic_ph: True }, updates=[update_eps_expr]) def act(ob, stochastic=True, update_eps=-1): return _act(ob, stochastic, update_eps) return act
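Stripped of the TensorFlow plumbing, the epsilon-greedy rule implemented above with tf.where and tf.cond amounts to the following NumPy sketch (illustrative only, not the baselines API):

import numpy as np

def epsilon_greedy(q_values, eps, stochastic=True, rng=np.random):
    # q_values: [batch_size, num_actions]; returns one action index per row.
    greedy = q_values.argmax(axis=1)
    if not stochastic:
        return greedy
    batch_size, num_actions = q_values.shape
    random_actions = rng.randint(num_actions, size=batch_size)
    explore = rng.uniform(size=batch_size) < eps      # which rows take a random action
    return np.where(explore, random_actions, greedy)

# e.g. pick actions for a batch of 4 states with 6 actions each:
acts = epsilon_greedy(np.random.randn(4, 6), eps=0.1)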
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Box) self.pdtype = pdtype = make_pdtype(ac_space) ### for binary actions ##### self.pdtype = pdtype = MultiCategoricalPdType( low=np.zeros_like(ac_space.low, dtype=np.int32), high=np.ones_like(ac_space.high, dtype=np.int32)) gaussian_fixed_var = True binary = True ############################# sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope('vf'): obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))) #tanh self.vpred = tf.layers.dense( last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0] with tf.variable_scope('pol'): last_out = obz for i in range(num_hid_layers): last_out = tf.nn.tanh( tf.layers.dense( last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))) #tanh if gaussian_fixed_var and isinstance( ac_space, gym.spaces.Box) and binary == False: mean = tf.layers.dense( last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01)) logstd = tf.get_variable( name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = tf.layers.dense( last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) # logstd = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='logstd', kernel_initializer=U.normc_initializer(0.01)) # pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) self.pd = pdtype.pdfromflat(pdparam) self.state_in = [] self.state_out = [] stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self._act = U.function([stochastic, ob], [ac, self.vpred])
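With the binary MultiCategoricalPdType above, each action dimension is an independent two-way categorical. A rough NumPy analogue of sampling from such a head (shapes assumed here as [batch, ac_dim, 2] for clarity, which differs from the flat pdparam layout actually used):

import numpy as np

def sample_binary_multicategorical(logits, rng=np.random):
    # logits: [batch, ac_dim, 2]; one independent {0, 1} choice per action dimension.
    z = logits - logits.max(axis=-1, keepdims=True)            # stabilized softmax
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    u = rng.uniform(size=probs.shape[:-1])
    return (u < probs[..., 1]).astype(np.int32)                # 1 with probability probs[..., 1]

# e.g. a batch of 2 observations and 5 binary action dimensions:
actions = sample_binary_multicategorical(np.random.randn(2, 5, 2))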
def learn( *, network, env, eval_env, make_eval_env, env_id, seed, beta, total_timesteps, timesteps_per_batch, # what to train on #num_samples=(1500,), num_samples=(1, ), #horizon=(5,), horizon=(2, ), #num_elites=(10,), num_elites=(1, ), max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, TRPO=False, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, #eval_targs=('mbmf',), eval_targs=('mf', ), quant=2, # For mbl.step mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', copos=True, **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete) ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in 
eval_targs: all_eval_descs.append( ('MeanRew', 'MBL_COPOS', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'COPOS', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) pool = Pool(mp.cpu_count()) warm_start_done = False # ---------------------------------------- atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Initialize eta, omega optimizer if discrete_ac_space: init_eta = 1 init_omega = 0.5 eta_omega_optimizer = EtaOmegaOptimizerDiscrete( beta, max_kl, init_eta, init_omega) else: init_eta = 0.5 init_omega = 2.0 #????eta_omega_optimizer details????? 
eta_omega_optimizer = EtaOmegaOptimizer(beta, max_kl, init_eta, init_omega) # Prepare for rollouts # ---------------------------------------- if traj_collect == 'mf': seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if traj_collect == 'mf-random' or traj_collect == 'mf-mb': seg_mbl = seg_gen_mbl.__next__() else: seg_mbl = seg add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...], ob[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"] mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...], ob_mbl[1:, 0, ...]) mbl.update_forward_dynamic(require_update=iters_so_far % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if traj_collect == 'mf': #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb': vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() if TRPO: shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log( "Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log( "violated KL constraint. shrinking step.") elif improve < 0: logger.log( "surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) else: copos_update_dir = stepdir # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts w_theta, w_beta = pi.split_w(copos_update_dir) tmp_ob = np.zeros( (1, ) + env.observation_space.shape ) # We assume that entropy does not depend on the NN # Optimize eta and omega if discrete_ac_space: entropy = lossbefore[4] #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1)) eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy) else: Waa, Wsa = pi.w2W(w_theta) wa = pi.get_wa(ob, w_beta) varphis = pi.get_varphis(ob) #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0] old_ent = lossbefore[4] eta, omega = eta_omega_optimizer.optimize( w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent) logger.log("Initial eta: " + str(eta) + " and omega: " + str(omega)) current_theta_beta = get_flat() prev_theta, prev_beta = pi.all_to_theta_beta( current_theta_beta) if discrete_ac_space: # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, max_kl, args, discrete_ac_space) logger.log("Updated eta, eta: " + str(eta)) set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) # Find proper omega for new eta. Use old policy parameters first. eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # do line search for ratio for non-linear "beta" parameter values #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, # max_kl, beta, args) # set ratio to 1 if we do not use beta ratio line search ratio = 1 #print("ratio from line search: " + str(ratio)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + ratio * w_beta.reshape( -1, ) / eta else: for i in range(2): # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, max_kl, args) logger.log("Updated eta, eta: " + str(eta) + " and omega: " + str(omega)) # Find proper omega for new eta. Use old policy parameters first. 
set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) eta, omega = \ eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # Use final policy logger.log("Final eta: " + str(eta) + " and omega: " + str(omega)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + w_beta.reshape(-1, ) / eta set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta)) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) ##copos specific over if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) #cg over for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) #policy update over with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [e[0] for e in all_eval_descs] list_legend_names = [e[1] for e in all_eval_descs] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.record_tabular(field_name, 
perf) if measure_time: logger.record_tabular( 'Time-%s' % (field_name), elapsed_time) if measure_rew: logger.record_tabular( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.record_tabular('FwdValError', fwd_dynamics_err) logger.dump_tabular() #print(logger.get_dir()) #print(targs_names) # if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') # else: # logger.dump_tabular() # if iters_so_far==21: # sys.exit() # ----------- #logger.dump_tabular() yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Saved {} validation samples'.format(len(val_dataset_collect)))
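# The learn() loop above solves F x = g (F = Fisher matrix of the policy, g = policy
# gradient) with conjugate gradient, using only Fisher-vector products via compute_fvp.
# A minimal NumPy sketch of such a solver is shown below as an illustration of the idea;
# the actual `cg` helper lives elsewhere in the codebase and may differ in details.
import numpy as np

def cg_sketch(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """Solve A x = b for symmetric positive-definite A, given only the map x -> A x."""
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at zero)
    p = r.copy()          # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

# The resulting direction is then rescaled so that the quadratic model of the KL equals
# max_kl:  fullstep = stepdir / sqrt(0.5 * stepdir^T F stepdir / max_kl), as in the code above.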
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None): """Creates the act function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select an action given an observation. See the top of the file for details. """ with tf.variable_scope(scope, reuse=reuse): observations_ph = make_obs_ph("observation") stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) q_values = q_func(observations_ph.get(), num_actions, scope="q_func") #inpt = observations_ph.get() #q_values = q_values + 10000 * inpt[:,-1,:36] deterministic_actions = tf.argmax(q_values, axis=1) batch_size = tf.shape(observations_ph.get())[0] #v = tf.cast(tf.argmax(inpt[:,-1,:36], axis=1), tf.float32) #w = 35 - v #random_actions = tf.random_uniform(tf.stack([batch_size])) * w #random_actions = tf.ceil(random_actions) + v #random_actions = tf.cast(random_actions, tf.int64) #print("ok") random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph], outputs=output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True}, updates=[update_eps_expr]) def act(ob, stochastic=True, update_eps=-1): return _act(ob, stochastic, update_eps) return act
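# build_act wires epsilon-greedy exploration into the TF graph: with probability eps a
# uniformly random action is taken, otherwise argmax_a Q(s, a). A minimal NumPy sketch of
# the same selection rule (illustrative only; `q_values` is assumed to be a precomputed
# (batch_size, num_actions) array rather than the output of the q_func graph):
import numpy as np

def epsilon_greedy(q_values, eps, rng=np.random):
    batch_size, num_actions = q_values.shape
    greedy_actions = q_values.argmax(axis=1)
    random_actions = rng.randint(num_actions, size=batch_size)
    take_random = rng.uniform(size=batch_size) < eps
    return np.where(take_random, random_actions, greedy_actions)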
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, rho=0.95, # Gradient weighting factor update_step_threshold=100, # Updating step threshold schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return td_v_target = tf.placeholder(dtype=tf.float32, shape=[1, 1]) # V target for RAC lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule # adv = tf.placeholder(dtype = tf.float32, shape = [1, 1]) # Advantage function for RAC clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] vf_rac_loss = tf.reduce_mean(tf.square(pi.vpred - td_v_target)) vf_rac_losses = [vf_rac_loss] vf_rac_loss_names = ["vf_rac_loss"] pol_rac_loss_surr1 = atarg * pi.pd.neglogp(ac) * ratio pol_rac_loss_surr2 = tf.clip_by_value( ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg * pi.pd.neglogp( ac) # pol_rac_loss = tf.reduce_mean( tf.minimum(pol_rac_loss_surr1, pol_rac_loss_surr2)) pol_rac_losses = [pol_rac_loss] pol_rac_loss_names = ["pol_rac_loss"] var_list = pi.get_trainable_variables() # vf_final_var_list = [v for v in var_list if v.name.split("/")[1].startswith( # "vf")] # pol_final_var_list = [v for v in var_list if v.name.split("/")[1].startswith( # "pol")] vf_final_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") and v.name.split("/")[2].startswith("final") ] pol_final_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") and v.name.split("/")[2].startswith("final") ] compatible_feature = U.flatgrad(pi.pd.neglogp(ac), pol_final_var_list) # compatible_feature = tf.reshape(compatible_feature, [compatible_feature.get_shape().as_list()[0], 1]) # compatible_feature_product = compatible_feature * tf.transpose(compatible_feature) # # omage_t_next = tf.matmul(tf.eye(compatible_feature.get_shape().as_list()[0]) - alpha * compatible_feature_product, omega_t)\ # + alpha * adv * compatible_feature # Train V function 
vf_lossandgrad = U.function([ob, td_v_target, lrmult], vf_rac_losses + [U.flatgrad(vf_rac_loss, vf_final_var_list)]) vf_adam = MpiAdam(vf_final_var_list, epsilon=adam_epsilon) # Train Policy pol_lossandgrad = U.function( [ob, ac, atarg, lrmult], pol_rac_losses + [U.flatgrad(pol_rac_loss, pol_final_var_list)]) pol_adam = MpiAdam(pol_final_var_list, epsilon=adam_epsilon) lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) compute_v_pred = U.function([ob], [pi.vpred]) get_pol_weights_num = np.sum( [np.prod(v.get_shape().as_list()) for v in pol_final_var_list]) get_compatible_feature = U.function([ob, ac], [compatible_feature]) U.initialize() adam.sync() pol_adam.sync() vf_adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, best_fitness episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" seg = None # omega_t = np.random.rand(get_pol_weights_num) omega_t = np.zeros(get_pol_weights_num) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) t = 0 ac = env.action_space.sample( ) # not used, just so we have the datatype new = True # marks if we're on first timestep of an episode ob = env.reset() cur_ep_ret = 0 # return in current episode cur_ep_len = 0 # len of current episode ep_rets = [] # returns of completed episodes in this segment ep_lens = [] # lengths of ... 
horizon = timesteps_per_actorbatch # Initialize history arrays obs = np.array([ob for _ in range(horizon)]) rews = np.zeros(horizon, 'float32') vpreds = np.zeros(horizon, 'float32') news = np.zeros(horizon, 'int32') acs = np.array([ac for _ in range(horizon)]) prevacs = acs.copy() rac_alpha = optim_stepsize * cur_lrmult * 0.1 rac_beta = optim_stepsize * cur_lrmult * 0.001 # from tqdm import tqdm # for t in tqdm(itertools.count(), ascii = True): pol_gradients = [] t_0 = 0 for t in itertools.count(): if timesteps_so_far % 10000 == 0 and timesteps_so_far > 0: result_record() prevac = ac ac, vpred = pi.act(stochastic=True, ob=ob) # Slight weirdness here because we need value function at time T # before returning segment [0, T-1] so we get the correct # terminal value if t > 0 and t % horizon == 0: seg = { "ob": obs, "rew": rews, "vpred": vpreds, "new": news, "ac": acs, "prevac": prevacs, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets, "ep_lens": ep_lens } ep_rets = [] ep_lens = [] break i = t % horizon obs[i] = ob vpreds[i] = vpred news[i] = new acs[i] = ac prevacs[i] = prevac if env.spec._env_name == "LunarLanderContinuous": ac = np.clip(ac, -1.0, 1.0) next_ob, rew, new, _ = env.step(ac) # Compute v target and TD v_target = rew + gamma * np.array( compute_v_pred(next_ob.reshape((1, ob.shape[0])))) adv = v_target - np.array( compute_v_pred(ob.reshape((1, ob.shape[0])))) # Update V and Update Policy vf_loss, vf_g = vf_lossandgrad(ob.reshape((1, ob.shape[0])), v_target, rac_alpha) vf_adam.update(vf_g, rac_alpha) pol_loss, pol_g = pol_lossandgrad(ob.reshape((1, ob.shape[0])), ac.reshape((1, ac.shape[0])), adv.reshape(adv.shape[0], ), rac_beta) compatible_feature = np.array( get_compatible_feature(ob.reshape((1, ob.shape[0])), ac.reshape((1, ac.shape[0])))) compatible_feature_product = compatible_feature * compatible_feature.T omega_t = (np.eye(compatible_feature_product.shape[0]) -0.1*rac_alpha * compatible_feature_product).dot( omega_t) \ + 0.1*rac_alpha * pol_g pol_gradients.append(omega_t) if t % update_step_threshold == 0 and t > 0: scaling_factor = [rho**(t - i) for i in range(t_0, t)] coef = t / np.sum(scaling_factor) sum_weighted_pol_gradients = np.sum([ scaling_factor[i] * pol_gradients[i] for i in range(len(scaling_factor)) ], axis=0) pol_adam.update(coef * sum_weighted_pol_gradients, rac_beta) pol_gradients = [] t_0 = t rews[i] = rew cur_ep_ret += rew cur_ep_len += 1 timesteps_so_far += 1 ob = next_ob if new: # Episode End Update if len(pol_gradients) > 0: scaling_factor = [rho**(t - i) for i in range(t_0, t)] coef = t / np.sum(scaling_factor) sum_weighted_pol_gradients = np.sum([ scaling_factor[i] * pol_gradients[i] for i in range(len(scaling_factor)) ], axis=0) pol_adam.update(coef * sum_weighted_pol_gradients, rac_beta) pol_gradients = [] t_0 = t # print( # "Episode {} - Total reward = {}, Total Steps = {}".format(episodes_so_far, cur_ep_ret, cur_ep_len)) ep_rets.append(cur_ep_ret) ep_lens.append(cur_ep_len) rewbuffer.extend(ep_rets) lenbuffer.extend(ep_lens) cur_ep_ret = 0 cur_ep_len = 0 ob = env.reset() episodes_so_far += 1 t += 1 add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) 
optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values # logger.log("Optimizing...") # logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis=0))) # logger.log("Current Iteration Training Performance:" + str(np.mean(seg["ep_rets"]))) if iters_so_far == 0: result_record() iters_so_far += 1
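# In the loop above, per-step policy gradients (omega_t) are not applied immediately: they
# are buffered and, every update_step_threshold steps or at episode end, combined with
# exponentially decaying weights rho**(t - i) so that recent steps dominate, then passed to
# the Adam updater. A small NumPy sketch of that weighting only (not the full training loop;
# names mirror the variables above but the function itself is illustrative):
import numpy as np

def weighted_gradient(pol_gradients, t_0, t, rho=0.95):
    """Combine gradients collected at steps t_0 .. t-1 with recency weights rho**(t - i)."""
    weights = np.array([rho ** (t - i) for i in range(t_0, t)])
    coef = t / weights.sum()
    stacked = np.stack(pol_gradients)                 # shape (t - t_0, n_params)
    return coef * (weights[:, None] * stacked).sum(axis=0)

# Example: three gradients collected at steps 7, 8, 9, combined at step t = 10.
grads = [np.ones(4), 2 * np.ones(4), 3 * np.ones(4)]
g_combined = weighted_gradient(grads, t_0=7, t=10, rho=0.95)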
def learn(env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], 
seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular()
def learn( env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, sample_stochastic=False, task="train", ckpt_dir=None, save_per_iter=100, load_model_path=None, task_name=None, max_sample_traj=1500): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("pol") ] vf_var_list = [ v for v in all_var_list if v.name.split("/")[1].startswith("vf") ] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n( [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) traj_gen = traj_episode_generator(pi, env, timesteps_per_batch, stochastic=sample_stochastic) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = 
deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 if task == 'sample_trajectory': # not elegant, i know :( sample_trajectory(load_model_path, max_sample_traj, traj_gen, task_name, sample_stochastic) sys.exit() if task == 'play': assert load_model_path is not None U.load_state(load_model_path) while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) # Save model if iters_so_far % save_per_iter == 0 and ckpt_dir is not None and task == 'train': U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular()
def build_train_dueling(make_obs_ph, q_func, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="deepq", input_dim=84 * 84 * 4, hash_dim=32, use_rp=False, imitate=False, reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" # act_f = build_act_dueling(make_obs_ph, q_func, model_func, num_actions, input_dim, hash_dim, use_rp, scope=scope, # reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") if imitate: imitate_act_t_ph = tf.placeholder(tf.float32, [None, num_actions], name="imitate_action") # EMDQN value_t_ph = tf.placeholder(tf.float32, [None], name='value_t') value_tp1_ph = tf.placeholder(tf.float32, [None], name='value_tp1') # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=False) # reuse parameters from act q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", reuse=False) # reuse parameters from act # q_t_normalized = q_t - tf.max(q_t,) q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) value_tp1_residual = tf.stop_gradient(tf.reduce_max(q_tp1, axis=1)) target_q_func_vars = U.scope_vars( U.absolute_scope_name("target_q_func")) value_tp1_masked = (1.0 - done_mask_ph) * (value_tp1_ph + value_tp1_residual) # compute RHS of bellman equation q_target = rew_t_ph + gamma * value_tp1_masked # compute the error (potentially clipped) td_error = q_target - (q_t_selected + value_t_ph) td_summary = tf.summary.scalar("td error", tf.reduce_mean(td_error)) # EMDQN print(q_t.shape) if imitate: imitation_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=imitate_act_t_ph, logits=q_t), axis=1) print(imitation_loss.shape) errors = U.huber_loss(td_error) + imitation_loss else: errors = U.huber_loss(td_error) total_summary = tf.summary.scalar("total error", tf.reduce_mean(errors)) value_summary = tf.summary.scalar("value_t", tf.reduce_mean(value_t_ph)) residual_value_summary = tf.summary.scalar( "value_tp1_residual", tf.reduce_mean(value_tp1_residual)) value_tp1_summary = tf.summary.scalar("value_tp1", tf.reduce_mean(value_tp1_ph)) q_summary = tf.summary.scalar("estimated qs", tf.reduce_mean(q_t_selected)) summaries = [ td_summary, total_summary, value_summary, value_tp1_summary, residual_value_summary, q_summary ] if imitate: imitate_summary = tf.summary.scalar("imitate loss", tf.reduce_mean(imitation_loss)) summaries.append(imitate_summary) summary = tf.summary.merge(summaries) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # update_target_fn will be called periodically to copy Q network to target Q network inputs = [ obs_t_input, obs_tp1_input, act_t_ph, rew_t_ph, done_mask_ph, importance_weights_ph, value_t_ph, value_tp1_ph ] if imitate: inputs.append(imitate_act_t_ph) # Create callable functions # EMDQN train = 
U.function(inputs=inputs, outputs=[td_error, summary], updates=[optimize_expr]) obs_hash_output, _ = model_func(obs_t_input.get(), num_actions, scope="hash_func", reuse=False) act = U.function(inputs=[obs_t_input], outputs=[obs_hash_output]) update_target = U.function([], [], updates=[update_target_expr]) return act, train, update_target
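# build_train_dueling above combines externally supplied value estimates (fed through the
# value_t / value_tp1 placeholders) with the network's own Q-values:
# target = r + gamma * (1 - done) * (value_tp1 + max_a Q_target(s', a)), with the TD error
# taken against (Q(s, a) + value_t) and passed through a Huber loss. A NumPy sketch of that
# target computation (illustrative only, mirroring the graph ops above):
import numpy as np

def huber(x, delta=1.0):
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

def td_error_sketch(rew, done, q_t_selected, value_t, q_tp1, value_tp1, gamma=0.99):
    value_tp1_residual = q_tp1.max(axis=1)                        # max_a Q_target(s', a)
    value_tp1_masked = (1.0 - done) * (value_tp1 + value_tp1_residual)
    q_target = rew + gamma * value_tp1_masked
    td_error = q_target - (q_t_selected + value_t)
    return td_error, huber(td_error)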
def _init(self, ob_space, ac_space, architecture_size): """ :param ob_space: (Gym Space) The observation space of the environment :param ac_space: (Gym Space) The action space of the environment :param architecture_size: (str) size of the policy's architecture (small as in A3C paper, large as in Nature DQN) """ obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) with tf.variable_scope(self.name, reuse=self.reuse): normalized_obs = obs / 255.0 if architecture_size == 'small': # from A3C paper layer_1 = tf.nn.relu( tf_util.conv2d(normalized_obs, 16, "l1", [8, 8], [4, 4], pad="VALID")) layer_2 = tf.nn.relu( tf_util.conv2d(layer_1, 32, "l2", [4, 4], [2, 2], pad="VALID")) flattened_layer_2 = tf_util.flattenallbut0(layer_2) last_layer = tf.nn.relu( tf.layers.dense( flattened_layer_2, 256, name='lin', kernel_initializer=tf_util.normc_initializer(1.0))) elif architecture_size == 'large': # Nature DQN layer_1 = tf.nn.relu( tf_util.conv2d(normalized_obs, 32, "l1", [8, 8], [4, 4], pad="VALID")) layer_2 = tf.nn.relu( tf_util.conv2d(layer_1, 64, "l2", [4, 4], [2, 2], pad="VALID")) layer_3 = tf.nn.relu( tf_util.conv2d(layer_2, 64, "l3", [3, 3], [1, 1], pad="VALID")) flattened_layer_3 = tf_util.flattenallbut0(layer_3) last_layer = tf.nn.relu( tf.layers.dense( flattened_layer_3, 512, name='lin', kernel_initializer=tf_util.normc_initializer(1.0))) else: raise NotImplementedError logits = tf.layers.dense( last_layer, pdtype.param_shape()[0], name='logits', kernel_initializer=tf_util.normc_initializer(0.01)) self.proba_distribution = pdtype.proba_distribution_from_flat( logits) self.vpred = tf.layers.dense( last_layer, 1, name='value', kernel_initializer=tf_util.normc_initializer(1.0))[:, 0] self.state_in = [] self.state_out = [] if self.stochastic_ph is None: self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=()) action = self.proba_distribution.sample() self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred])
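# With VALID padding, each conv layer above shrinks the feature map as
# out = (in - kernel) // stride + 1. Assuming the usual 84x84 image observations (the
# observation size is not fixed by this snippet), the two architectures flatten to:
def conv_out(size, kernel, stride):
    return (size - kernel) // stride + 1

size = 84
small = conv_out(conv_out(size, 8, 4), 4, 2)        # 84 -> 20 -> 9
large = conv_out(small, 3, 1)                       # 9 -> 7 (extra 3x3 stride-1 layer)
print(small * small * 32)   # 2592 features feeding the 256-unit dense layer ('small')
print(large * large * 64)   # 3136 features feeding the 512-unit dense layer ('large')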
def learn( args, env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) writer=None): ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = {} ob['adj'] = U.get_placeholder_cached(name="adj") ob['node'] = U.get_placeholder_cached(name="node") ob_gen = {} ob_gen['adj'] = U.get_placeholder( shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_gen') ob_gen['node'] = U.get_placeholder( shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_gen') ob_real = {} ob_real['adj'] = U.get_placeholder( shape=[None, ob_space['adj'].shape[0], None, None], dtype=tf.float32, name='adj_real') ob_real['node'] = U.get_placeholder( shape=[None, 1, None, ob_space['node'].shape[2]], dtype=tf.float32, name='node_real') ac = tf.placeholder(dtype=tf.int64, shape=[None, 4], name='ac_real') ## PPO loss kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_logp = pi.pd.logp(ac) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = [ "mean_ppo_loss", "mean_entropy_loss", "mean_vpred_loss", "mean_kl", "mean_entropy" ] ## Expert loss loss_expert = -tf.reduce_mean(pi_logp) step_pred_real, step_logit_real = discriminator_net(ob_real, args, name='d_step') step_pred_gen, step_logit_gen = discriminator_net(ob_gen, args, name='d_step') loss_d_step_real = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_real, labels=tf.ones_like(step_logit_real) * 0.9)) loss_d_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) loss_d_step = loss_d_step_real + loss_d_step_gen if args.gan_type == 'normal': loss_g_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.zeros_like(step_logit_gen))) elif args.gan_type == 'recommend': loss_g_step_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=step_logit_gen, labels=tf.ones_like(step_logit_gen) * 0.9)) final_pred_real, final_logit_real = discriminator_net(ob_real, args, name='d_final') final_pred_gen, final_logit_gen = discriminator_net(ob_gen, args, name='d_final') loss_d_final_real = tf.reduce_mean( 
tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_real, labels=tf.ones_like(final_logit_real) * 0.9)) loss_d_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) loss_d_final = loss_d_final_real + loss_d_final_gen if args.gan_type == 'normal': loss_g_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.zeros_like(final_logit_gen))) elif args.gan_type == 'recommend': loss_g_final_gen = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( logits=final_logit_gen, labels=tf.ones_like(final_logit_gen) * 0.9)) var_list_pi = pi.get_trainable_variables() var_list_pi_stop = [ var for var in var_list_pi if ('emb' in var.name) or ('gcn' in var.name) or ('stop' in var.name) ] var_list_d_step = [ var for var in tf.global_variables() if 'd_step' in var.name ] var_list_d_final = [ var for var in tf.global_variables() if 'd_final' in var.name ] ## loss update function lossandgrad_ppo = U.function([ ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult ], losses + [U.flatgrad(total_loss, var_list_pi)]) lossandgrad_expert = U.function( [ob['adj'], ob['node'], ac, pi.ac_real], [loss_expert, U.flatgrad(loss_expert, var_list_pi)]) lossandgrad_d_step = U.function( [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_step, U.flatgrad(loss_d_step, var_list_d_step)]) lossandgrad_d_final = U.function( [ob_real['adj'], ob_real['node'], ob_gen['adj'], ob_gen['node']], [loss_d_final, U.flatgrad(loss_d_final, var_list_d_final)]) loss_g_gen_step_func = U.function([ob_gen['adj'], ob_gen['node']], loss_g_step_gen) loss_g_gen_final_func = U.function([ob_gen['adj'], ob_gen['node']], loss_g_final_gen) adam_pi = MpiAdam(var_list_pi, epsilon=adam_epsilon) adam_d_step = MpiAdam(var_list_d_step, epsilon=adam_epsilon) adam_d_final = MpiAdam(var_list_d_final, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ ob['adj'], ob['node'], ac, pi.ac_real, oldpi.ac_real, atarg, ret, lrmult ], losses) # Prepare for rollouts # ---------------------------------------- episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths lenbuffer_valid = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_env = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_step = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_d_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final = deque(maxlen=100) # rolling buffer for episode rewards rewbuffer_final_stat = deque( maxlen=100) # rolling buffer for episode rewardsn seg_gen = traj_segment_generator(args, pi, env, timesteps_per_actorbatch, True, loss_g_gen_step_func, loss_g_gen_final_func) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" if args.load == 1: try: fname = './ckpt/' + args.name_full_load sess = tf.get_default_session() saver = tf.train.Saver(var_list_pi) saver.restore(sess, fname) iters_so_far = int(fname.split('_')[-1]) + 1 print('model restored!', fname, 'iters_so_far:', iters_so_far) except: print(fname, 'ckpt not found, start with iters 0') U.initialize() adam_pi.sync() 
adam_d_step.sync() adam_d_final.sync() level = 0 ## start training while True: if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob_adj, ob_node, ac, atarg, tdlamret = seg["ob_adj"], seg[ "ob_node"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob_adj=ob_adj, ob_node=ob_node, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob_adj.shape[0] # inner training loop, train policy for i_optim in range(optim_epochs): loss_expert = 0 g_expert = 0 g_expert_stop = 0 loss_d_step = 0 loss_d_final = 0 g_ppo = 0 g_d_step = 0 g_d_final = 0 pretrain_shift = 5 ## Expert if iters_so_far >= args.expert_start and iters_so_far <= args.expert_end + pretrain_shift: ## Expert train # # # learn how to stop ob_expert, ac_expert = env.get_expert(optim_batchsize) loss_expert, g_expert = lossandgrad_expert( ob_expert['adj'], ob_expert['node'], ac_expert, ac_expert) loss_expert = np.mean(loss_expert) ## PPO if iters_so_far >= args.rl_start and iters_so_far <= args.rl_end: assign_old_eq_new( ) # set old parameter values to new parameter values batch = d.next_batch(optim_batchsize) # ppo if iters_so_far >= args.rl_start + pretrain_shift: # start generator after discriminator trained a well.. 
*newlosses, g_ppo = lossandgrad_ppo( batch["ob_adj"], batch["ob_node"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) if args.has_d_step == 1 and i_optim >= optim_epochs // 2: # update step discriminator ob_expert, _ = env.get_expert( optim_batchsize, curriculum=args.curriculum, level_total=args.curriculum_num, level=level) loss_d_step, g_d_step = lossandgrad_d_step( ob_expert["adj"], ob_expert["node"], batch["ob_adj"], batch["ob_node"]) adam_d_step.update(g_d_step, optim_stepsize * cur_lrmult) loss_d_step = np.mean(loss_d_step) if args.has_d_final == 1 and i_optim >= optim_epochs // 4 * 3: # update final discriminator ob_expert, _ = env.get_expert( optim_batchsize, is_final=True, curriculum=args.curriculum, level_total=args.curriculum_num, level=level) seg_final_adj, seg_final_node = traj_final_generator( pi, copy.deepcopy(env), optim_batchsize, True) # update final discriminator loss_d_final, g_d_final = lossandgrad_d_final( ob_expert["adj"], ob_expert["node"], seg_final_adj, seg_final_node) adam_d_final.update(g_d_final, optim_stepsize * cur_lrmult) # update generator adam_pi.update(0.2 * g_ppo + 0.05 * g_expert, optim_stepsize * cur_lrmult) ## PPO val losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob_adj"], batch["ob_node"], batch["ac"], batch["ac"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) if writer is not None: writer.add_scalar("loss_teacher_forcing", loss_expert, iters_so_far) writer.add_scalar("loss_d_step", loss_d_step, iters_so_far) writer.add_scalar("loss_d_final", loss_d_final, iters_so_far) writer.add_scalar('grad_expert_min', np.amin(g_expert), iters_so_far) writer.add_scalar('grad_expert_max', np.amax(g_expert), iters_so_far) writer.add_scalar('grad_expert_norm', np.linalg.norm(g_expert), iters_so_far) writer.add_scalar('grad_expert_stop_min', np.amin(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_max', np.amax(g_expert_stop), iters_so_far) writer.add_scalar('grad_expert_stop_norm', np.linalg.norm(g_expert_stop), iters_so_far) writer.add_scalar('grad_rl_min', np.amin(g_ppo), iters_so_far) writer.add_scalar('grad_rl_max', np.amax(g_ppo), iters_so_far) writer.add_scalar('grad_rl_norm', np.linalg.norm(g_ppo), iters_so_far) writer.add_scalar('g_d_step_min', np.amin(g_d_step), iters_so_far) writer.add_scalar('g_d_step_max', np.amax(g_d_step), iters_so_far) writer.add_scalar('g_d_step_norm', np.linalg.norm(g_d_step), iters_so_far) writer.add_scalar('g_d_final_min', np.amin(g_d_final), iters_so_far) writer.add_scalar('g_d_final_max', np.amax(g_d_final), iters_so_far) writer.add_scalar('g_d_final_norm', np.linalg.norm(g_d_final), iters_so_far) writer.add_scalar('lr', optim_stepsize * cur_lrmult, iters_so_far) for (lossval, name) in zipsame(meanlosses, loss_names): if writer is not None: writer.add_scalar("loss_" + name, lossval, iters_so_far) if writer is not None: writer.add_scalar("ev_tdlam_before", explained_variance(vpredbefore, tdlamret), iters_so_far) lrlocal = (seg["ep_lens"], seg["ep_lens_valid"], seg["ep_rets"], seg["ep_rets_env"], seg["ep_rets_d_step"], seg["ep_rets_d_final"], seg["ep_final_rew"], seg["ep_final_rew_stat"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, lens_valid, rews, rews_env, rews_d_step, rews_d_final, rews_final, rews_final_stat = map( flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) 
lenbuffer_valid.extend(lens_valid) rewbuffer.extend(rews) rewbuffer_d_step.extend(rews_d_step) rewbuffer_d_final.extend(rews_d_final) rewbuffer_env.extend(rews_env) rewbuffer_final.extend(rews_final) rewbuffer_final_stat.extend(rews_final_stat) if writer is not None: writer.add_scalar("EpLenMean", np.mean(lenbuffer), iters_so_far) writer.add_scalar("EpLenValidMean", np.mean(lenbuffer_valid), iters_so_far) writer.add_scalar("EpRewMean", np.mean(rewbuffer), iters_so_far) writer.add_scalar("EpRewDStepMean", np.mean(rewbuffer_d_step), iters_so_far) writer.add_scalar("EpRewDFinalMean", np.mean(rewbuffer_d_final), iters_so_far) writer.add_scalar("EpRewEnvMean", np.mean(rewbuffer_env), iters_so_far) writer.add_scalar("EpRewFinalMean", np.mean(rewbuffer_final), iters_so_far) writer.add_scalar("EpRewFinalStatMean", np.mean(rewbuffer_final_stat), iters_so_far) writer.add_scalar("EpThisIter", len(lens), iters_so_far) episodes_so_far += len(lens) timesteps_so_far += sum(lens) if writer is not None: writer.add_scalar("EpisodesSoFar", episodes_so_far, iters_so_far) writer.add_scalar("TimestepsSoFar", timesteps_so_far, iters_so_far) writer.add_scalar("TimeElapsed", time.time() - tstart, iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: with open('molecule_gen/' + args.name_full + '.csv', 'a') as f: f.write('***** Iteration {} *****\n'.format(iters_so_far)) # save if iters_so_far % args.save_every == 0: fname = './ckpt/' + args.name_full + '_' + str(iters_so_far) saver = tf.train.Saver(var_list_pi) saver.save(tf.get_default_session(), fname) print('model saved!', fname) iters_so_far += 1 if iters_so_far % args.curriculum_step and iters_so_far // args.curriculum_step < args.curriculum_num: level += 1
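# The step/final discriminators above use sigmoid cross-entropy with one-sided label
# smoothing: real samples get target 0.9 instead of 1.0, generated samples get 0.0 (and,
# for gan_type == 'recommend', the generator is trained toward 0.9 as well). A NumPy sketch
# of the numerically stable form computed by tf.nn.sigmoid_cross_entropy_with_logits
# (illustrative only):
import numpy as np

def sigmoid_xent(logits, labels):
    # max(z, 0) - z*t + log(1 + exp(-|z|))  ==  -t*log(sigmoid(z)) - (1 - t)*log(1 - sigmoid(z))
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))

def d_step_loss(logit_real, logit_gen):
    loss_real = sigmoid_xent(logit_real, 0.9 * np.ones_like(logit_real)).mean()
    loss_gen = sigmoid_xent(logit_gen, np.zeros_like(logit_gen)).mean()
    return loss_real + loss_gen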
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, num_options=2, dc=0): assert isinstance(ob_space, gym.spaces.Box) # define action and observation space self.ac_space_dim = ac_space.shape[0] self.ob_space_dim = ob_space.shape[0] self.dc = dc self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32) self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32) self.num_options = num_options self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None]) # create a filter for the pure shape, meaning excluding u[k-1] obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), ) with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space.shape) with tf.variable_scope("obfilter_pure"): self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure) obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) obz_pure = tf.clip_by_value( (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) / self.ob_rms_only.std, -5.0, 5.0) # implement Q-function approximation last_out0 = obz # for option 0 last_out1 = obz_pure # for option 1 for i in range(num_hid_layers): last_out0 = tf.nn.relu( U.dense(last_out0, hid_size, "vffc0%i" % (i + 1), weight_init=U.normc_initializer(1.0))) last_out1 = tf.nn.relu( U.dense(last_out1, hid_size, "vffc1%i" % (i + 1), weight_init=U.normc_initializer(1.0))) last_out0 = U.dense(last_out0, 1, "vfff0", weight_init=U.normc_initializer(1.0)) last_out1 = U.dense(last_out1, 1, "vfff1", weight_init=U.normc_initializer(1.0)) # return the Q-function value self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0] # implement parametrizatzion for policy over options last_out0 = obz # for option 0 last_out1 = obz_pure # for option 1 for i in range(num_hid_layers): last_out0 = tf.nn.relu( U.dense(last_out0, hid_size, "oppi0%i" % (i + 1), weight_init=U.normc_initializer(1.0))) last_out1 = tf.nn.relu( U.dense(last_out1, hid_size, "oppi1%i" % (i + 1), weight_init=U.normc_initializer(1.0))) last_out0 = U.dense(last_out0, 1, "oppif0", weight_init=U.normc_initializer(1.0)) last_out1 = U.dense(last_out1, 1, "oppif1", weight_init=U.normc_initializer(1.0)) last_out = tf.concat([last_out0, last_out1], 1) # return probabilities for the options self.op_pi = tf.nn.softmax(last_out) # always terminate self.tpred = tf.nn.sigmoid( dense3D2(tf.stop_gradient(last_out), 1, "termhead", option, num_options=num_options, weight_init=U.normc_initializer(1.0)))[:, 0] termination_sample = tf.constant([True]) # define the control policy / intra-option policy last_out = obz_pure for i in range(num_hid_layers): last_out = tf.nn.relu( U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0))) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense3D2(last_out, pdtype.param_shape()[0] // 2, "polfinal", option, num_options=num_options, weight_init=U.normc_initializer(0.01), bias=False) # now also use relus to squash to -1,1 mean = (-tf.nn.relu(-(mean - 1)) + tf.nn.relu(-(mean + 1))) + 1 logstd = tf.get_variable( name="logstd", shape=[num_options, 1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer()) pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]], axis=1) else: pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) 
self.state_in = [] self.state_out = [] # sample stochastically -> this corresponds to exploration stochastic = tf.placeholder(dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) # choose the appropriate action, apply the ZOH if using option 0 ac = U.switch(option[0], ac, tf.stop_gradient(ob[:, -self.ac_space_dim:])) ac = tf.clip_by_value(ac, -1.0, 1.0) self.last_action = tf.stop_gradient(ac) self._act = U.function([stochastic, ob, option], [ac, self.vpred, last_out, logstd]) self._get_v = U.function([ob, option], [self.vpred]) self.get_term = U.function([ob, option], [termination_sample]) self.get_tpred = U.function([ob, option], [self.tpred]) self.get_vpred = U.function([ob, option], [self.vpred]) self._get_op = U.function([ob], [self.op_pi])
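import numpy as np

# Hedged numpy sketch (illustrative names) of the option-0 zero-order hold wired
# above: the observation is assumed to end with the previous action u[k-1], so
# option 0 simply replays it, while option 1 takes the learned policy's action.
def select_action(option, policy_action, ob, ac_dim):
    ac = ob[-ac_dim:] if option == 0 else policy_action
    return np.clip(ac, -1.0, 1.0)   # same clipping as applied in the graph above

# Example: an observation whose last two entries hold the previous action [0.3, -0.7]
ob = np.array([0.1, 0.2, 0.5, 0.3, -0.7])
print(select_action(0, np.array([0.9, 0.9]), ob, ac_dim=2))   # -> [ 0.3 -0.7]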
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t")) act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1")) done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = U.scope_vars(U.absolute_scope_name("q_func")) # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func")) # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, weighted_error, var_list=q_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
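import numpy as np

# Hedged numpy sketch of the TD target built above, for a single transition.
# `q_tp1_online` / `q_tp1_target` stand in for the online and target network
# outputs at s_{t+1}; all names here are illustrative.
def td_target(reward, done, q_tp1_online, q_tp1_target, gamma=1.0, double_q=True):
    if double_q:
        a_star = np.argmax(q_tp1_online)       # action picked by the online net
        q_tp1_best = q_tp1_target[a_star]      # ...but evaluated by the target net
    else:
        q_tp1_best = np.max(q_tp1_target)      # vanilla Q-learning max
    q_tp1_best = (1.0 - done) * q_tp1_best     # zero out the bootstrap at terminals
    return reward + gamma * q_tp1_best

print(td_target(1.0, 0.0, np.array([0.2, 0.5]), np.array([0.4, 0.3]), gamma=0.99))  # -> 1.297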
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, old_qmin = -100, old_qmax = 100, nbins = 200, new_qmin = -100, new_qmax = 100, double_q=False, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. 
""" if param_noise: act_f = build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) print("build_train::num_actions: ", num_actions) #OK with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation #q_t,q_t2D = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # (1D num_actions* bins,2D num_actions,values) q_t2D = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # (1D num_actions* bins,2D num_actions,values) q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution #_,q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") # (1D num_actions* bins,2D num_actions,values) q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") # (1D num_actions* bins,2D num_actions,values) target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state Q(\phi_j, a_j) print("tf.shape(act_t_ph): ", tf.shape(act_t_ph)) #print("q_t.get_shape()): ", q_t.get_shape()) print("q_t2D.get_shape()): ", q_t2D.get_shape()) print("act_t_ph.get_shape()): ", act_t_ph.get_shape()) #print("size.get_shape()): ", size.get_shape()) logits_list = [] for i in range(32): logits_list.append( q_t2D[i,act_t_ph[i],:]) logits = tf.stack(logits_list) print("logits.get_shape()): ", logits.get_shape()) #logits[i,:] = q_t2D[i,act_t_ph[i],:] #logits = q_t2D[:,0,:] #size = tf.ones([act_t_ph.shape[0]],dtype=tf.int32 )*nbins #logits = tf.slice(q_t, act_t_ph, size) #sel_act = tf.expand_dims(tf.one_hot(act_t_ph, num_actions),2) #print("sel_act.get_shape(): ", sel_act.get_shape()) # in order to create a mask with multiple ones for given action we first create a one mask and tile #sel = tf.tile(sel_act, [1,1,nbins]) # we multiply mask with number of bins #print("sel.get_shape(): ", sel.get_shape()) #sel_act = tf.reshape(sel , [tf.shape(sel)[0],tf.shape(sel)[1]*tf.shape(sel)[2]]) #print("sel_act.get_shape(): ", sel_act.get_shape()) #q_t_selected = q_t * sel_act # we select the action with all bins #print("q_t_selected.get_shape(): ", q_t_selected.get_shape()) # compute estimate of best possible value starting from state at t + 1 #if double_q: # q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) # q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) # q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) #else: # # bin of max_{a'}(Q(\phi_{j+1), a') b_tp1_best = tf.reduce_max(tf.cast(tf.argmax(q_tp1, 2),tf.float32),1) # we choose highest next Q bin print("b_tp1_best.get_shape(): ", b_tp1_best.get_shape()) # compute RHS of bellman equation dbin = (new_qmax-new_qmin)/nbins #delta_bin # bin2Q -> max_{a'}(Q(\phi_{j+1), a') q_tp1_best_new = (b_tp1_best*dbin + dbin/2) + new_qmin q_tp1_best_old = old_qmin + (q_tp1_best_new - new_qmin) * (old_qmax - old_qmin) /(new_qmax - 
new_qmin); q_tp1_best_old = (1.0 - done_mask_ph) * q_tp1_best_old q_tp1_best_old = rew_t_ph + gamma * (q_tp1_best_old)# target Q value q_tp1_best_new = new_qmin + (q_tp1_best_old - old_qmin) * (new_qmax - new_qmin) /(old_qmax - old_qmin); bin_target = tf.cast(( (tf.clip_by_value(q_tp1_best_new,new_qmin,new_qmax) - (new_qmin)) // dbin),tf.int32)# label_bin: Q2bin(target) #q_t_selected_target = tf.one_hot(act_t_ph*nbins +q_t_val,nbins*num_actions) # convert it to one hot encoding # compute the error (potentially clipped) new_errors = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=bin_target) #new_errors = tf.nn.softmax_cross_entropy_with_logits(labels =q_t_selected_target, logits = q_t_selected ) # cross entropy weighted_error = tf.reduce_mean(importance_weights_ph * new_errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function( inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[new_errors], updates=[optimize_expr] ) val = U.function( # this is added only to monitor if values are calculated correctly inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=[new_errors], #,q_t,q_tp1, q_t_selected , q_t_selected_target , q_tp1_best ,tot_val,q_t_val ] ) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t2D) return act_f, train, update_target, {'q_values': q_values} , val
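import numpy as np

# Hedged numpy sketch of the bin <-> Q-value conversions used above: Q-values in
# [new_qmin, new_qmax] are discretised into `nbins` equal-width bins, the Bellman
# backup is done on the continuous value (after rescaling to the "old" range),
# and the result is converted back to a bin index used as the cross-entropy label.
def bin_to_q(b, qmin, qmax, nbins):
    dbin = (qmax - qmin) / nbins
    return qmin + b * dbin + dbin / 2.0            # centre of bin b

def q_to_bin(q, qmin, qmax, nbins):
    dbin = (qmax - qmin) / nbins
    return int((np.clip(q, qmin, qmax) - qmin) // dbin)   # note: q == qmax lands one past the last bin, as in the graph above

# Defaults above: qmin=-100, qmax=100, nbins=200 -> dbin = 1.0
print(bin_to_q(150, -100, 100, 200))    # -> 50.5
print(q_to_bin(50.5, -100, 100, 200))   # -> 150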
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None): """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905): Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that take a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable function to select and action given observation. ` See the top of the file for details. """ if param_noise_filter_func is None: param_noise_filter_func = default_param_noise_filter with tf.variable_scope(scope, reuse=reuse): observations_ph = U.ensure_tf_input(make_obs_ph("observation")) stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic") update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps") update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold") update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale") reset_ph = tf.placeholder(tf.bool, (), name="reset") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False) param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False) # Unmodified Q. q_values = q_func(observations_ph.get(), num_actions, scope="q_func") # Perturbable Q used for the actual rollout. q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func") # We have to wrap this code into a function due to the way tf.cond() works. See # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for # a more detailed discussion. def perturb_vars(original_scope, perturbed_scope): all_vars = U.scope_vars(U.absolute_scope_name("q_func")) all_perturbed_vars = U.scope_vars(U.absolute_scope_name("perturbed_q_func")) assert len(all_vars) == len(all_perturbed_vars) perturb_ops = [] for var, perturbed_var in zip(all_vars, all_perturbed_vars): if param_noise_filter_func(perturbed_var): # Perturb this variable. op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale)) else: # Do not perturb, just assign. op = tf.assign(perturbed_var, var) perturb_ops.append(op) assert len(perturb_ops) == len(all_vars) return tf.group(*perturb_ops) # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy # of the network and measures the effect of that perturbation in action space. If the perturbation # is too big, reduce scale of perturbation, otherwise increase. 
q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func") perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func") kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1) mean_kl = tf.reduce_mean(kl) def update_scale(): with tf.control_dependencies([perturb_for_adaption]): update_scale_expr = tf.cond(mean_kl < param_noise_threshold, lambda: param_noise_scale.assign(param_noise_scale * 1.01), lambda: param_noise_scale.assign(param_noise_scale / 1.01), ) return update_scale_expr # Functionality to update the threshold for parameter space noise. update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold)) # Put everything together. deterministic_actions = tf.argmax(q_values_perturbed, axis=1) batch_size = tf.shape(observations_ph.get())[0] random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64) chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions) update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps)) updates = [ update_eps_expr, tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])), tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)), update_param_noise_threshold_expr, ] act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph], outputs=output_actions, givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False}, updates=updates) return act
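# Hedged sketch of the adaptive rule wired into update_scale() above: after
# perturbing a throw-away copy of the Q-network, the mean KL between the softmax
# of the unperturbed and perturbed Q-values is compared to a threshold, and the
# parameter-noise scale is nudged up or down by a fixed factor (1.01 above).
def adapt_noise_scale(scale, mean_kl, threshold, factor=1.01):
    if mean_kl < threshold:
        return scale * factor   # effect in action space too small -> increase noise
    return scale / factor       # effect too large -> decrease noise

scale = adapt_noise_scale(0.01, mean_kl=0.02, threshold=0.05)   # -> 0.0101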
def learn_with_human( env, test_env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) success_reward=10000, save_path='model/new_model', data_queue=None): # Setup losses and stuff # ---------------------------------------- rew_mean = [] if hasattr(env.observation_space, 'spaces'): ob_space = env.observation_space.spaces['observation'] else: ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts seg_gen = traj_segment_generator_perturb(pi, env, timesteps_per_batch, stochastic=True, coeff=0.2, q=data_queue) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards test_interval = 10 eval_interval = 5 test_start = 0 eval_start = 0 assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise 
NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) print('training part......') seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values print('optimize for %d epochs' % optim_epochs) for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) if eval_start % eval_interval == 0: print('evaluation part......') curr_rew = evaluate(pi, test_env) rew_mean.append(curr_rew) print('evalution reward: ', curr_rew) if test_start % test_interval == 0: print('testing part.......') test_rew = test_random(pi, test_env, True, 0.3, data_queue) print('test reward: ', test_rew) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) episodes_so_far += len(lens) if len(lens) != 0: rew_mean.append(np.mean(rewbuffer)) timesteps_so_far += sum(lens) iters_so_far += 1 test_start += 1 eval_start += 1 return rew_mean
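import numpy as np

# Hedged numpy sketch of the clipped surrogate minimised above (pol_surr):
# ratio = pi_new(a|s) / pi_old(a|s), and the objective is the pessimistic
# minimum of the unclipped and clipped advantage-weighted ratios, negated
# so it can be minimised.
def ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))

# One sample where the new policy over-weights a positive-advantage action:
# ratio = exp(0.8) ~ 2.23 is clipped to 1.2, so the loss is -1.2 * 2.0 = -2.4
print(ppo_clip_loss(np.array([-0.1]), np.array([-0.9]), np.array([2.0])))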
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): assert isinstance(ob_space, gym.spaces.Dict) self.pdtype = pdtype = make_pdtype(ac_space) sequence_length = None ob_config = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.spaces['joint'].shape)) ob_target = U.get_placeholder(name="goal", dtype=tf.float32, shape=[sequence_length] + list(ob_space.spaces['target'].shape)) obs_pos = U.get_placeholder( name="obs_pos", dtype=tf.float32, shape=[sequence_length] + list(ob_space.spaces['obstacle_pos1'].shape)) #is_training = U.get_placeholder(name="bn_training", dtype=tf.bool, shape=()) # construct v function model '''with tf.variable_scope("obfilter"): self.ob_rms = RunningMeanStd(shape=ob_space['joint'].shape) obz = tf.clip_by_value((ob_config - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) last_out = obz goal_last_out = tf.clip_by_value((ob_target - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)''' last_out = ob_config goal_last_out = ob_target obs_last_out = obs_pos for i in range(num_hid_layers): last_out = dense(last_out, hid_size, "vfcfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={}) #last_out = tf.layers.batch_normalization(last_out, training=is_training, name="vfcbn%i"%(i+1)) last_out = tf.nn.tanh(last_out) goal_last_out = dense(goal_last_out, hid_size, "vfgfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={}) #goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="vfgbn%i" % (i + 1)) goal_last_out = tf.nn.tanh(goal_last_out) obs_last_out = dense(obs_last_out, hid_size, "vfobsfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={}) #obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="vfobn%i"%(i+1)) obs_last_out = tf.nn.tanh(obs_last_out) vpred = tf.concat([last_out, goal_last_out, obs_last_out], -1) self.vpred = dense(vpred, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] # construct policy probability distribution model last_out = ob_config goal_last_out = ob_target obs_last_out = obs_pos for i in range(num_hid_layers): last_out = dense(last_out, hid_size, "pol_cfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={}) #last_out = tf.layers.batch_normalization(last_out, training=is_training, name="pol_cbn%i"%(i+1)) last_out = tf.nn.tanh(last_out) goal_last_out = dense(goal_last_out, hid_size, "pol_gfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={}) #goal_last_out = tf.layers.batch_normalization(goal_last_out, training=is_training, name="pol_gbn%i" % (i + 1)) goal_last_out = tf.nn.tanh(goal_last_out) obs_last_out = dense(obs_last_out, hid_size, "pol_obsfc%i" % (i + 1), weight_init=U.normc_initializer(1.0), weight_loss_dict={}) #obs_last_out = tf.layers.batch_normalization(obs_last_out, training=is_training, name="pol_obn%i"%(i+1)) obs_last_out = tf.nn.tanh(obs_last_out) last_out = tf.concat([last_out, goal_last_out, obs_last_out], -1) if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01)) logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.constant_initializer( [0.2, 0.2, -1., -1.])) pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) else: pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) self.pd = pdtype.pdfromflat(pdparam) 
self.state_in = [] self.state_out = [] # change for BC: the stochastic flag is exposed as a cached, named placeholder so other graph-building code can retrieve it stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) self.ac = ac self._act = U.function([stochastic, ob_config, ob_target, obs_pos], [ac, self.vpred])
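import numpy as np

# Hedged numpy sketch of the three-stream encoder used by this policy: each
# observation component (joint configuration, goal, obstacle position) gets its
# own tanh MLP and the embeddings are concatenated before the value / policy
# heads. Shapes and random weights below are stand-ins for illustration only.
def tanh_mlp(x, sizes, rng):
    for size in sizes:
        w = rng.standard_normal((x.shape[-1], size)) / np.sqrt(x.shape[-1])
        x = np.tanh(x @ w)
    return x

rng = np.random.default_rng(0)
joint = rng.standard_normal((1, 7))      # assumed joint-state dimension
goal = rng.standard_normal((1, 3))       # assumed target dimension
obstacle = rng.standard_normal((1, 3))   # assumed obstacle-position dimension
features = np.concatenate(
    [tanh_mlp(x, [64, 64], rng) for x in (joint, goal, obstacle)], axis=-1)
print(features.shape)   # (1, 192)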
def build_train_contrast(make_obs_ph, model_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, scope="mfec", latent_dim=32, alpha=0.05, beta=0.1, theta=0.1, loss_type=["contrast"], c_loss_type="sqmargin", reuse=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ # z_func = build_act_contrast(make_obs_ph, model_func, num_actions, scope=scope, secondary_scope="model_func", # reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders # EMDQN # tau = tf.placeholder(tf.float32, [1], name='tau') # momentum = tf.placeholder(tf.float32, [1], name='momentum') obs_input_query = U.ensure_tf_input(make_obs_ph("obs_query")) obs_input_positive = U.ensure_tf_input(make_obs_ph("enc_obs_pos")) obs_input_negative = U.ensure_tf_input(make_obs_ph("enc_obs_neg")) value_input_query = tf.placeholder(tf.float32, [None], name="value") action_embedding = tf.Variable(tf.random_normal( [num_actions, latent_dim], stddev=1), name="action_embedding") action_input = tf.placeholder(tf.int32, [None], name="action") inputs = [obs_input_query] if "contrast" in loss_type: inputs += [obs_input_positive, obs_input_negative] if "regression" in loss_type: inputs += [value_input_query] if "linear_model" in loss_type: inputs += [action_input] if "contrast" not in loss_type: inputs += [obs_input_positive] z = model_func(obs_input_query.get(), num_actions, scope="model_func", reuse=tf.AUTO_REUSE) h = model_func(obs_input_query.get(), num_actions, scope="hash_func", reuse=False) # _, v = model_func( # obs_input_query.get(), num_actions, # scope="model_func", # reuse=True) z_pos = model_func(obs_input_positive.get(), num_actions, scope="model_func", reuse=True) z_neg = model_func(obs_input_negative.get(), num_actions, scope="model_func", reuse=True) z_pos = tf.reshape(z_pos, [-1, latent_dim]) z_tar = tf.reshape(z, [-1, latent_dim]) z_neg = tf.reshape(z_neg, [-1, latent_dim]) contrast_loss = contrastive_loss_fc(z_tar, z_pos, z_neg, c_type=c_loss_type) regression_loss = tf.reduce_mean( tf.squared_difference(tf.norm(z_tar, axis=1), alpha * value_input_query)) action_embeded = tf.matmul(tf.one_hot(action_input, num_actions), action_embedding) model_loss = tf.reduce_mean( tf.squared_difference(action_embeded + z_tar, z_pos)) print("shape:", z_tar.shape, 
z_pos.shape, z_neg.shape, action_embeded.shape) # contrast_loss = tf.reduce_mean(tf.log(sum_negative) - positive) # print("shape2:", z.shape, negative.shape, positive.shape) # prediction_loss = tf.losses.mean_squared_error(value_input, v) total_loss = 0 if "contrast" in loss_type: total_loss += contrast_loss if "regression" in loss_type: total_loss += beta * regression_loss elif "linear_model" in loss_type: total_loss += theta * model_loss model_func_vars = U.scope_vars(U.absolute_scope_name("model_func")) if "linear_model" in loss_type: model_func_vars.append(action_embedding) if grad_norm_clipping is not None: optimize_expr = U.minimize_and_clip(optimizer, total_loss, var_list=model_func_vars, clip_val=grad_norm_clipping) else: optimize_expr = optimizer.minimize(total_loss, var_list=model_func_vars) # Create callable functions z_var_summary = tf.summary.scalar( "z_var", tf.reduce_mean(tf.math.reduce_std(z, axis=1))) negative_summary = tf.summary.scalar( "negative_dist", tf.reduce_mean(emb_dist(z_tar, z_neg))) positive_summary = tf.summary.scalar( "positive_dist", tf.reduce_mean(emb_dist(z_tar, z_pos))) contrast_loss_summary = tf.summary.scalar( "contrast loss", tf.reduce_mean(contrast_loss)) regression_loss_summary = tf.summary.scalar( "regression loss", tf.reduce_mean(regression_loss)) model_loss_summary = tf.summary.scalar("model loss", tf.reduce_mean(model_loss)) # prediction_loss_summary = tf.summary.scalar("prediction loss", tf.reduce_mean(prediction_loss)) total_loss_summary = tf.summary.scalar("total loss", tf.reduce_mean(total_loss)) summaries = [z_var_summary, total_loss_summary] if "contrast" in loss_type: summaries += [ negative_summary, positive_summary, contrast_loss_summary ] if "regression" in loss_type: summaries.append(regression_loss_summary) if "linear_model" in loss_type: summaries.append(model_loss_summary) summary = tf.summary.merge(summaries) outputs = [z_tar] if "contrast" in loss_type: outputs += [z_pos, z_neg] elif "linear_model" in loss_type: outputs += [z_pos] outputs += [total_loss, summary] train = U.function(inputs=inputs, outputs=outputs, updates=[optimize_expr]) eval = U.function(inputs=inputs, outputs=outputs, updates=[]) z_func = U.function( inputs=[obs_input_query], outputs=[z, h], ) norm_func = U.function(inputs=[obs_input_query], outputs=[tf.norm(z_tar, axis=1)]) return z_func, train, eval, norm_func
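# Hedged sketch of how the total loss above is assembled; note the if/elif
# chain means "regression" takes precedence over "linear_model" when both
# appear in loss_type, mirroring the graph construction above.
def combine_losses(contrast, regression, model, loss_type, beta=0.1, theta=0.1):
    loss = 0.0
    if "contrast" in loss_type:
        loss += contrast
    if "regression" in loss_type:
        loss += beta * regression
    elif "linear_model" in loss_type:
        loss += theta * model
    return loss

print(combine_losses(0.5, 2.0, 1.0, ["contrast", "regression"]))   # 0.5 + 0.1 * 2.0 = 0.7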