def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # gradient only; the parameter update is applied by MpiAdam below
    # (running update_op here as well would double-step the variables)
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    # this loop should reproduce the losses printed by the first loop
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
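# For reference, a sketch of the update MpiAdam is expected to perform, based on
# the baselines-style implementation (names and defaults here are illustrative,
# not taken from this repo):
def mpi_adam_step_sketch(theta, g_local, m, v, t, stepsize,
                         beta1=0.9, beta2=0.999, eps=1e-8):
    from mpi4py import MPI
    g = np.zeros_like(g_local)
    MPI.COMM_WORLD.Allreduce(g_local, g, op=MPI.SUM)  # sum gradients across workers
    g /= MPI.COMM_WORLD.Get_size()                    # average across workers
    t += 1
    a = stepsize * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)  # bias correction
    m = beta1 * m + (1 - beta1) * g        # first moment estimate
    v = beta2 * v + (1 - beta2) * (g * g)  # second moment estimate
    return theta - a * m / (np.sqrt(v) + eps), m, v, t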
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  # pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    entval = calcent(Mval).mean()  # pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  # pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  # pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  # pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
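# A minimal usage sketch for validate_probtype, modeled on baselines-style
# distribution tests. The import path and the flat parameter vectors below are
# assumptions for illustration, not taken from this repo:
def test_probtypes_sketch():
    from baselines.common.distributions import CategoricalPdType, DiagGaussianPdType
    np.random.seed(0)
    with U.single_threaded_session():
        # 3-way categorical distribution: 3 logits
        validate_probtype(CategoricalPdType(3), np.array([-0.2, 0.3, 0.5]))
        # 2-D diagonal Gaussian: 2 means followed by 2 log-stds
        validate_probtype(DiagGaussianPdType(2), np.array([-0.2, 0.3, 1.0, -0.5]))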
def __init__(self, env, hidden_size, sequence_size, attention_size, cell_type,
             entcoeff=0.001, lr_rate=0.0, scope="adversary"):
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.action_shape = env.action_space.shape
    self.num_observations = self.observation_shape[0]
    self.num_actions = self.action_shape[0]
    self.embedding_size = self.num_observations + self.num_actions
    self.hidden_size = hidden_size
    self.sequence_size = sequence_size
    self.attention_size = attention_size
    self.cell_type = cell_type
    self.build_ph()
    # Build graph
    generator_logits, self.rewards_op = self.build_graph(
        self.generator_traj_ph, self.generator_traj_seq_len, reuse=False)
    expert_logits, _ = self.build_graph(self.expert_traj_ph,
                                        self.expert_traj_seq_len, reuse=True)
    # Build accuracy
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
    # Build regression loss
    # let x = logits, z = targets.
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=generator_logits, labels=tf.zeros_like(generator_logits))
    generator_loss = tf.reduce_mean(generator_loss)
    expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=expert_logits, labels=tf.ones_like(expert_logits))
    expert_loss = tf.reduce_mean(expert_loss)
    # Build entropy loss
    logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
    entropy_loss = -entcoeff * entropy
    # Loss + Accuracy terms
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss,
                   generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss",
                      "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss
    var_list = self.get_trainable_variables()
    self.lossandgrad = U.function(
        [self.generator_traj_ph, self.generator_traj_seq_len,
         self.expert_traj_ph, self.expert_traj_seq_len, self.dropout_keep_prob],
        self.losses + [U.flatgrad(self.total_loss, var_list)])
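# logit_bernoulli_entropy is used above but not defined in this section. A
# sketch of the standard baselines-style definition (numerically stable via
# softplus):
def logsigmoid(a):
    """Equivalent to tf.log(tf.sigmoid(a))."""
    return -tf.nn.softplus(-a)

def logit_bernoulli_entropy(logits):
    """Entropy of Bernoulli(sigmoid(logits)), expressed in terms of logits."""
    return (1.0 - tf.nn.sigmoid(logits)) * logits - logsigmoid(logits)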
def _init(self, ob_space, ac_space, kind):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    if kind == 'small':  # from A3C paper
        x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
    elif kind == 'large':  # Nature DQN
        x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
    else:
        raise NotImplementedError

    logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, env, hidden_size, discriminatorStepSize=3e-4, entcoeff=0.001, scope="adversary"):
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    self.input_shape = tuple([o + a for o, a in zip(self.observation_shape, self.actions_shape)])
    self.num_actions = env.action_space.shape[0]
    self.hidden_size = hidden_size
    self.discriminatorStepSize = discriminatorStepSize
    self.build_ph()
    # Build graph
    generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
    # Build accuracy
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
    # Build regression loss
    # let x = logits, z = targets.
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=generator_logits, labels=tf.zeros_like(generator_logits))
    generator_loss = tf.reduce_mean(generator_loss)
    expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=expert_logits, labels=tf.ones_like(expert_logits))
    expert_loss = tf.reduce_mean(expert_loss)
    # Build entropy loss
    logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
    entropy_loss = -entcoeff * entropy
    # Loss + Accuracy terms
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss,
                   generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss",
                      "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss
    # Build reward for policy
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
    var_list = self.get_trainable_variables()
    self.lossandgrad = U.function(
        [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
        self.losses + [U.flatgrad(self.total_loss, var_list)])
    self.get_expert_logits = U.function([self.expert_obs_ph, self.expert_acs_ph], expert_logits)
    self.get_logits = U.function(
        [self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
        [expert_logits] + [generator_logits])
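# Sketch of how reward_op is typically consumed during rollouts in
# baselines-style GAIL code (a get_reward method along these lines is an
# assumption, not shown in this section):
def get_reward(self, obs, acs):
    sess = tf.get_default_session()
    if len(obs.shape) == 1:
        obs = np.expand_dims(obs, 0)  # add a batch dimension
    if len(acs.shape) == 1:
        acs = np.expand_dims(acs, 0)
    feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs}
    return sess.run(self.reward_op, feed_dict)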
def learn(args, env, policy_func, dataset, optim_batch_size=128,
          adam_epsilon=1e-5, optim_stepsize=3e-4):
    # ============================== INIT FROM ARGS ==================================
    max_iters = args.BC_max_iter
    pretrained = args.pretrained
    ckpt_dir = args.checkpoint_dir
    log_dir = args.log_dir
    task_name = args.task_name

    val_per_iter = int(max_iters / 10)
    pi = policy_func(args, "pi", env)  # Construct network for new policy
    oldpi = policy_func(args, "oldpi", env)
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters + 1))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            loss, g = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name), counter=iter_so_far)
    if pretrained:
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, max_to_keep=args.max_to_keep)
        return savedir_fname
def learn(env, policy_func, dataset, pretrained, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None):
    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # mean squared error between expert actions and policy actions
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    # takes observation, action, and a stochastic-policy flag (bool); returns the
    # loss (MSE between expert and policy actions) and its gradient
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            loss, g = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name), counter=iter_so_far)
    if pretrained:
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, var_list=pi.get_variables())
        return savedir_fname
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2
    lin = function([x, x2], z, givens={x2: 0})

    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
def __init__(self, epsilon=1e-2, shape=()):
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum",
        trainable=False)
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq",
        trainable=False)
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count",
        trainable=False)  # count starts at epsilon (0.01); shape=() means a scalar
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    # variance is clamped from below at 1e-2, so std is always >= 0.1
    self.std = tf.sqrt(
        tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))

    newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    # tf.assign_add(a, b) adds the second value onto the first in place
    self.incfiltparams = U.function(
        [newsum, newsumsq, newcount], [],
        updates=[
            tf.assign_add(self._sum, newsum),
            tf.assign_add(self._sumsq, newsumsq),
            tf.assign_add(self._count, newcount)
        ])
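# The update method that typically accompanies this class in baselines-style
# MPI code, shown as a sketch (assumes mpi4py's MPI is imported): it sums the
# batch statistics across all workers and applies them through incfiltparams.
def update(self, x):
    x = x.astype('float64')
    n = int(np.prod(self.shape))
    totalvec = np.zeros(n * 2 + 1, 'float64')
    addvec = np.concatenate([x.sum(axis=0).ravel(),
                             np.square(x).sum(axis=0).ravel(),
                             np.array([len(x)], dtype='float64')])
    MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
    self.incfiltparams(totalvec[0:n].reshape(self.shape),
                       totalvec[n:2 * n].reshape(self.shape),
                       totalvec[2 * n])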
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
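# A minimal usage sketch of how _act is typically wrapped by the surrounding
# policy class in baselines-style code (this act method is an assumption, not
# shown in this section): a single observation gets a batch dimension before
# the feed, and the batch dimension is stripped from the outputs.
def act(self, stochastic, ob):
    ac1, vpred1 = self._act(stochastic, ob[None])
    return ac1[0], vpred1[0]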
def _init(self, ob_space, ac_space):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_proba_dist_type(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    obscaled = ob / 255.0

    with tf.variable_scope("pol"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.proba_distribution_from_flat(logits)
    with tf.variable_scope("vf"):
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()  # XXX
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def __init__(self, env, hidden_size, discriminatorStepSize=3e-4, entcoeff=0.001, scope="adversary"):
    global old_gen_loss, old_exp_loss
    print("Init Wasserstein discriminator")
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    self.input_shape = tuple([o + a for o, a in zip(self.observation_shape, self.actions_shape)])
    self.num_actions = env.action_space.n if isinstance(env.action_space, Discrete) \
        else env.action_space.shape[0]
    self.hidden_size = hidden_size
    self.discriminatorStepSize = discriminatorStepSize

    # PLACEHOLDERS
    self.generator_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape,
                                           name="observations_ph")
    self.generator_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape,
                                           name="actions_ph")
    self.expert_obs_ph = tf.placeholder(tf.float32, (None,) + self.observation_shape,
                                        name="expert_observations_ph")
    self.expert_acs_ph = tf.placeholder(tf.float32, (None,) + self.actions_shape,
                                        name="expert_actions_ph")

    # Build graph
    gen_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    exp_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
    # Build accuracy
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(gen_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(exp_logits) > 0.5))
    # regression losses to control progress:
    old_gen_loss = regression_loss(gen_logits)
    old_exp_loss = regression_loss(exp_logits)
    # NR1. Use Wasserstein loss
    discriminator_loss = tf.contrib.gan.losses.wargs.wasserstein_discriminator_loss(
        exp_logits, gen_logits)
    # --- not sure about this loss function, but it doesn't take part in calculations:
    generator_loss = -tf.reduce_mean(gen_logits)
    # Build entropy loss
    logits = tf.concat([gen_logits, exp_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
    entropy_loss = -entcoeff * entropy
    # Loss + Accuracy terms
    self.losses = [
        generator_loss, discriminator_loss, old_gen_loss, old_exp_loss, entropy,
        entropy_loss, generator_acc, expert_acc, discriminator_loss + entropy_loss
    ]
    self.loss_name = [
        "gen_loss", "disc_loss", "old_gen_loss", "old_exp_loss", "entropy",
        "entropy_loss", "generator_acc", "expert_acc", "total_loss"
    ]
    self.total_loss = discriminator_loss + entropy_loss
    # Build reward for policy
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(gen_logits) + 1e-8)
    # NR2. Use RMSPropOptimizer
    self.optimizer = tf.train.RMSPropOptimizer(
        learning_rate=discriminatorStepSize).minimize(
            self.total_loss, var_list=self.get_trainable_variables())
    # NR3. Clip weights to the range [-.01, .01]
    clip_ops = []
    for var in self.get_trainable_variables():
        clip_bounds = [-.01, .01]
        clip_ops.append(tf.assign(var, tf.clip_by_value(var, clip_bounds[0], clip_bounds[1])))
    self.clip_disc_weights = tf.group(*clip_ops)

    self.dict = [self.generator_obs_ph, self.generator_acs_ph,
                 self.expert_obs_ph, self.expert_acs_ph]
    # ================================ FUNCTIONS =====================================
    self.disc_train_op = U.function(self.dict, self.optimizer)
    self.losses = U.function(self.dict, self.losses)
    self.get_expert_logits = U.function([self.expert_obs_ph, self.expert_acs_ph], exp_logits)
    self.get_logits = U.function(self.dict, [exp_logits] + [gen_logits])
    self.clip = U.function(self.dict, self.clip_disc_weights)
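# A hypothetical training-step sketch for the Wasserstein discriminator above
# (n_critic and the feed tuple are illustrative; in WGAN the critic is usually
# stepped several times per generator update, with weights re-clipped after
# each step):
def train_wasserstein_critic_sketch(disc, gen_obs, gen_acs, exp_obs, exp_acs, n_critic=5):
    feed = (gen_obs, gen_acs, exp_obs, exp_acs)
    for _ in range(n_critic):
        disc.disc_train_op(*feed)  # RMSProp step on disc.total_loss
        disc.clip(*feed)           # enforce the [-0.01, 0.01] weight constraint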
def learn(env, policy_func, dataset, pretrained, optim_batch_size=128, max_iters=1e3,
          adam_epsilon=1e-6, optim_stepsize=2e-4,
          ckpt_dir=None, log_dir=None, task_name=None, high_level=False):
    val_per_iter = int(max_iters / 100)
    ob_space = env.observation_space
    ac_space = env.action_space
    start_time = time.time()

    if not high_level:
        pi_low = policy_func("pi_low", ob_space, ac_space.spaces[1])
        # placeholder
        # ob_low = U.get_placeholder_cached(name="ob")
        ob_low = pi_low.ob
        ac_low = pi_low.pdtype.sample_placeholder([None])
        # stochastic_low = U.get_placeholder_cached(name="stochastic")
        stochastic_low = pi_low.stochastic
        loss_low = tf.reduce_mean(tf.square(ac_low - pi_low.ac))
        var_list_low = pi_low.get_trainable_variables()
        adam_low = MpiAdam(var_list_low, epsilon=adam_epsilon)
        lossandgrad_low = U.function([ob_low, ac_low, stochastic_low],
                                     [loss_low] + [U.flatgrad(loss_low, var_list_low)])

        if not pretrained:
            writer = U.FileWriter(log_dir)
            ep_stats_low = stats(["Loss_low"])
        U.initialize()
        adam_low.sync()
        logger.log("Pretraining with Behavior Cloning Low...")
        for iter_so_far in tqdm(range(int(max_iters))):
            ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train', high_level)
            loss, g = lossandgrad_low(ob_expert, ac_expert, True)
            adam_low.update(g, optim_stepsize)
            if not pretrained:
                ep_stats_low.add_all_summary(writer, [loss], iter_so_far)
            if iter_so_far % val_per_iter == 0:
                ob_expert, ac_expert = dataset.get_next_batch(-1, 'val', high_level)
                loss, g = lossandgrad_low(ob_expert, ac_expert, False)
                logger.log("Validation:")
                logger.log("Loss: %f" % loss)
                if not pretrained:
                    U.save_state(os.path.join(ckpt_dir, task_name), counter=iter_so_far)
        if pretrained:
            savedir_fname = tempfile.TemporaryDirectory().name
            U.save_state(savedir_fname, var_list=pi_low.get_variables())
            return savedir_fname
    else:
        pi_high = policy_func("pi_high", ob_space, ac_space.spaces[0])  # high -> action_label
        # ob_high = U.get_placeholder_cached(name="ob")
        ob_high = pi_high.ob
        ac_high = pi_high.pdtype.sample_placeholder([None, 1])
        onehot_labels = tf.one_hot(indices=tf.cast(ac_high, tf.int32), depth=3)
        # stochastic_high = U.get_placeholder_cached(name="stochastic")
        stochastic_high = pi_high.stochastic
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=pi_high.logits,
                                                                labels=onehot_labels)
        loss_high = tf.reduce_mean(cross_entropy)
        var_list_high = pi_high.get_trainable_variables()
        adam_high = MpiAdam(var_list_high, epsilon=adam_epsilon)
        lossandgrad_high = U.function([ob_high, ac_high, stochastic_high],
                                      [loss_high] + [U.flatgrad(loss_high, var_list_high)])

        # train high level policy
        if not pretrained:
            writer = U.FileWriter(log_dir)
            # ep_stats_low = stats(["Loss_low"])
            ep_stats_high = stats(["loss_high"])
        U.initialize()
        adam_high.sync()
        logger.log("Pretraining with Behavior Cloning High...")
        for iter_so_far in tqdm(range(int(max_iters))):
            ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train', high_level)
            loss, g = lossandgrad_high(ob_expert, ac_expert, True)
            adam_high.update(g, optim_stepsize)
            if not pretrained:
                ep_stats_high.add_all_summary(writer, [loss], iter_so_far)
            if iter_so_far % val_per_iter == 0:
                ob_expert, ac_expert = dataset.get_next_batch(-1, 'val', high_level)
                loss, g = lossandgrad_high(ob_expert, ac_expert, False)
                logger.log("Validation:")
                logger.log("Loss: %f" % loss)
                if not pretrained:
                    U.save_state(os.path.join(ckpt_dir, task_name), counter=iter_so_far)
        if pretrained:
            savedir_fname = tempfile.TemporaryDirectory().name
            U.save_state(savedir_fname, var_list=pi_high.get_variables())
            return savedir_fname
    print("--- %s seconds ---" % (time.time() - start_time))
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -20.0, 20.0)

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # last_out = obz
    # for i in range(num_hid_layers):
    #     last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
    #                                   weight_init=U.normc_initializer(1.0)))

    ### add conv net instead of using dense
    self.msize = 64  # change to 64 later
    self.ssize = 64
    self.isize = 11
    self.available_action_size = 524

    minimap = obz[:, 0:5 * self.msize * self.msize]
    screen = obz[:, 5 * self.msize * self.msize:
                 5 * self.msize * self.msize + 10 * self.ssize * self.ssize]
    info = obz[:, (5 * self.msize * self.msize + 10 * self.ssize * self.ssize):
               (5 * self.msize * self.msize + 10 * self.ssize * self.ssize + self.isize)]
    available_action = obz[:, (5 * self.msize * self.msize + 10 * self.ssize * self.ssize + self.isize):
                           (5 * self.msize * self.msize + 10 * self.ssize * self.ssize +
                            self.isize + self.available_action_size)]

    conv1_minimap = tf.layers.conv2d(
        inputs=tf.reshape(minimap, [-1, self.msize, self.msize, 5]),
        filters=10,
        kernel_size=5,
        strides=1,
        padding='same',
        activation=tf.nn.leaky_relu,
        name="polmconv1")  # -> (64, 64, 10)
    pool1_minimap = tf.layers.max_pooling2d(conv1_minimap, pool_size=4, strides=4,
                                            name="polmpool1")  # -> (16, 16, 10)
    conv2_minimap = tf.layers.conv2d(pool1_minimap, 10, 5, 1, 'same',
                                     activation=tf.nn.relu,
                                     name="polmconv2")  # -> (16, 16, 10)
    pool2_minimap = tf.layers.max_pooling2d(conv2_minimap, 2, 2,
                                            name="polmpool2")  # -> (8, 8, 10)
    flat_minimap = tf.reshape(pool2_minimap, [-1, 8 * 8 * 10])  # -> (8*8*10, )
    # dense_minimap = tf.layers.dense(inputs=flat_minimap, units=1024, activation=tf.nn.relu)
    # # dropout_mininmap = tf.layers.dropout(
    # #     inputs=dense_minimap, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    # minimap_output = tf.layers.dense(dense_minimap, 64)

    conv1_screen = tf.layers.conv2d(
        inputs=tf.reshape(screen, [-1, self.ssize, self.ssize, 10]),  # (64, 64, 10)
        filters=20,
        kernel_size=5,
        strides=1,
        padding='same',
        activation=tf.nn.leaky_relu,
        name="polsconv1")  # -> (64, 64, 20)
    pool1_screen = tf.layers.max_pooling2d(conv1_screen, pool_size=4, strides=4,
                                           name="polspool1")  # -> (16, 16, 20)
    conv2_screen = tf.layers.conv2d(pool1_screen, 20, 5, 1, 'same',
                                    activation=tf.nn.relu,
                                    name="polsconv2")  # -> (16, 16, 20)
    pool2_screen = tf.layers.max_pooling2d(conv2_screen, 2, 2,
                                           name="polspool2")  # -> (8, 8, 20)
    flat_screen = tf.reshape(pool2_screen, [-1, 8 * 8 * 20])  # -> (8*8*20, )
    # dense_screen = tf.layers.dense(inputs=flat_screen, units=1024, activation=tf.nn.relu)
    # # dropout_screen = tf.layers.dropout(
    # #     inputs=dense_screen, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    # screen_output = tf.layers.dense(dense_screen, 64, tf.nn.relu)

    info_fc = tf.layers.dense(inputs=layers.flatten(info), units=4,
                              activation=tf.tanh, name="poldense1")
    aa_fc = tf.layers.dense(inputs=layers.flatten(available_action), units=16,
                            activation=tf.tanh, name="poldense2")

    last_out = tf.concat([flat_minimap, flat_screen, info_fc, aa_fc], axis=1, name="polconcat")
    # last_out = tf.layers.dense(inputs=last_out, units=600, name="poldense3")
    # last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc1", weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def learn(env, policy_func, discriminator, expert_dataset, pretrained, pretrained_weight, *,
          g_step, d_step, episodes_per_batch,  # what to train on
          dropout_keep_prob, sequence_size,  # rnn parameters
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
          callback=None,
          save_per_iter=100,
          ckpt_dir=None, log_dir=None,
          load_model_path=None, task_name=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg],
                                     losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator, episodes_per_batch,
                                     stochastic=True, seq_length=sequence_size)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new()  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                    assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                            (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        traj_gen, traj_len_gen = seg["ep_trajs"], seg["ep_lens"]
        # traj_expert, traj_len_expert = expert_dataset.get_next_traj_batch()
        batch_size = len(traj_gen) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for traj_batch, traj_len_batch in dataset.iterbatches(
                (traj_gen, traj_len_gen),
                include_final_partial_batch=False,
                batch_size=batch_size):
            traj_expert, traj_len_expert = expert_dataset.get_next_traj_batch(len(traj_batch))
            # update running mean/std for discriminator
            ob_batch, _ = traj2trans(traj_batch, traj_len_batch, ob_space.shape[0])
            ob_expert, _ = traj2trans(traj_expert, traj_len_expert, ob_space.shape[0])
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(traj_batch, traj_len_batch,
                                                      traj_expert, traj_len_expert,
                                                      dropout_keep_prob)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0), iters_so_far)
            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer),
                                              np.mean(rewbuffer),
                                              np.mean(lenbuffer)], iters_so_far)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    last_action = U.get_placeholder(shape=(None, 524), dtype=tf.float32,
                                    name="last_action_one_hot")

    self.msize = 64  # change to 64 later
    self.ssize = 64
    self.isize = 11
    self.available_action_size = 524

    available_action = ob[:, (5 * self.msize * self.msize + 10 * self.ssize * self.ssize + self.isize):
                          (5 * self.msize * self.msize + 10 * self.ssize * self.ssize +
                           self.isize + self.available_action_size)]
    # ob = ob[:, :-(self.available_action_size)]

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -20.0, 20.0)
    obz = (ob - self.ob_rms.mean) / self.ob_rms.std

    minimap = obz[:, 0:5 * self.msize * self.msize]
    # minimap /= 2
    screen = obz[:, 5 * self.msize * self.msize:
                 5 * self.msize * self.msize + 10 * self.ssize * self.ssize]
    # screen /= 2
    info = obz[:, (5 * self.msize * self.msize + 10 * self.ssize * self.ssize):
               (5 * self.msize * self.msize + 10 * self.ssize * self.ssize + self.isize)]
    # info /= 2

    # get value prediction, critic
    mconv1 = tf.layers.conv2d(
        inputs=tf.reshape(minimap, [-1, self.msize, self.msize, 5]),
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu)
    mpool1 = tf.layers.max_pooling2d(inputs=mconv1, pool_size=[2, 2], strides=2)
    mconv2 = tf.layers.conv2d(
        inputs=mpool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu,
        name="vffcmconv2")
    mpool2 = tf.layers.max_pooling2d(inputs=mconv2, pool_size=[2, 2], strides=2)
    mpool2_flat = tf.reshape(mpool2, [-1, 16 * 16 * 64])

    sconv1 = tf.layers.conv2d(
        inputs=tf.reshape(screen, [-1, self.ssize, self.ssize, 10]),
        filters=48,
        kernel_size=[5, 5],
        padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu)
    spool1 = tf.layers.max_pooling2d(inputs=sconv1, pool_size=[2, 2], strides=2)
    sconv2 = tf.layers.conv2d(
        inputs=spool1,
        filters=80,
        kernel_size=[5, 5],
        padding="same",
        kernel_initializer=U.normc_initializer(0.01),
        activation=tf.nn.leaky_relu)
    spool2 = tf.layers.max_pooling2d(inputs=sconv2, pool_size=[2, 2], strides=2)
    spool2_flat = tf.reshape(spool2, [-1, 16 * 16 * 80])

    info_fc = tf.layers.dense(inputs=layers.flatten(info), units=8, activation=tf.tanh)
    aa_fc = tf.layers.dense(inputs=layers.flatten(available_action), units=32,
                            activation=tf.tanh)

    HIDDEN_SIZE = 128
    l1_action = tf.layers.dense(layers.flatten(last_action), 256, tf.nn.relu)
    input_to_rnn = tf.reshape(l1_action, [-1, 16, 16])
    action_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=HIDDEN_SIZE,
                                                    forget_bias=1.0, state_is_tuple=True)
    inputs_rnn = tf.unstack(input_to_rnn, num=16, axis=1)
    rnn_outputs, rnn_state = tf.contrib.rnn.static_rnn(action_lstm_cell, inputs_rnn,
                                                       dtype=tf.float32)
    l2_action = tf.layers.dense(rnn_state[-1], 128, tf.nn.tanh)  # hidden layer
    last_acs_ph_lstm = tf.layers.dense(l2_action, 32, tf.nn.tanh)

    last_out = tf.concat([mpool2_flat, spool2_flat, info_fc, aa_fc, last_acs_ph_lstm], axis=1)

    vf_last_out = tf.nn.tanh(U.dense(last_out, 1024, 'vf_last_out',
                                     weight_init=U.normc_initializer(1.0)))
    # vf_last_out_2 = tf.nn.tanh(U.dense(vf_last_out, 64, 'vf_last_out_2',
    #                                    weight_init=U.normc_initializer(1.0)))
    self.vpred = U.dense(vf_last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        pol_last_out = U.dense(last_out, (pdtype.param_shape()[0]) * 5, "polfinaldense",
                               U.normc_initializer(0.01))
        pdparam = U.dense(pol_last_out, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(available_action), self.pd.mode(available_action))
    self.ac = ac
    self._act = U.function([stochastic, ob, last_action], [ac, self.vpred])
def __init__(self, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
    self.scope = scope
    # self.observation_shape = env.observation_space.shape
    # self.actions_shape = env.action_space.shape
    # print('~~~~~~~~~~', self.observation_shape, self.actions_space)
    self.msize = 64  # change to 64 later
    self.ssize = 64
    self.isize = 11
    self.available_action_size = 524

    from gym import spaces
    self.ob_space = spaces.Box(
        low=-1000, high=10000,
        shape=(5 * self.msize * self.msize + 10 * self.ssize * self.ssize +
               self.isize + self.available_action_size, ))
    self.ac_space = spaces.Discrete(self.available_action_size)
    self.observation_shape = self.ob_space.shape
    self.actions_shape = self.ac_space.shape
    self.hidden_size = hidden_size
    self.build_ph()
    # Build graph
    generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph,
                                        self.generator_last_action_ph, reuse=False)
    expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph,
                                     self.expert_last_action_ph, reuse=True)
    # Build accuracy
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    self.generator_acc = generator_acc
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
    self.expert_acc = expert_acc
    # Build regression loss
    # let x = logits, z = targets.
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=generator_logits, labels=tf.zeros_like(generator_logits))
    generator_loss = tf.reduce_mean(generator_loss)
    expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=expert_logits, labels=tf.ones_like(expert_logits))
    expert_loss = tf.reduce_mean(expert_loss)
    # Build entropy loss
    logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
    entropy_loss = -entcoeff * entropy
    # Loss + Accuracy terms
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss,
                   generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss",
                      "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss
    # Build reward for policy.
    # Scale the reward up, since the network is large and the signal may vanish
    # if the reward is small; include generator_loss so that logits of 0.4 and
    # 0.1 are not treated as the same.
    self.reward_op = 100 * (-tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8) + generator_loss)
    var_list = self.get_trainable_variables()
    self.lossandgrad = U.function(
        [self.generator_obs_ph, self.generator_acs_ph, self.generator_last_action_ph,
         self.expert_obs_ph, self.expert_acs_ph, self.expert_last_action_ph],
        self.losses + [U.flatgrad(self.total_loss, var_list)])
def learn(env, policy_func, discriminator, expert_dataset, pretrained, pretrained_weight, *,
          g_step, d_step, timesteps_per_batch,  # what to train on
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.001,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=1.5e-4,
          vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,
          save_per_iter=100,
          ckpt_dir=None, log_dir=None,
          load_model_path=None, task_name=None,
          timesteps_per_actorbatch=16,
          clip_param=1e-5,
          adam_epsilon=4e-4,
          optim_epochs=1,
          optim_stepsize=4e-4,
          optim_batchsize=16,
          schedule='linear'):
    nworkers = MPI.COMM_WORLD.Get_size()
    print("##### nworkers: ", nworkers)
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = np.array([5*64*64 + 10*64*64 + 11 + 524])  # env.observation_space
    # ac_space = np.array([1])  # env.action_space
    from gym import spaces
    ob_space = spaces.Box(low=-1000, high=10000,
                          shape=(5 * 64 * 64 + 10 * 64 * 64 + 11 + 524, ))
    ac_space = spaces.Discrete(524)
    pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=(None, ob_space[0]))
    ac = pi.pdtype.sample_placeholder([None])
    # prevac = pi.pdtype.sample_placeholder([None])
    prevac_placeholder = U.get_placeholder_cached(name="last_action_one_hot")

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    # ent = pi.pd.entropy_usual()  # see how it works; the value is the same
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    # entbonus = entcoeff * meanent
    # entcoeff = entcoeff * lrmult + 1e-5
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, prevac_placeholder, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    g_adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, prevac_placeholder, atarg, ret, lrmult], losses)

    # all_var_list = pi.get_trainable_variables()
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    # vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    g_adam.sync()
    d_adam.sync()
    # vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, discriminator, timesteps_per_batch,
                                     expert_dataset, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # # if a pretrained weight is provided
    # if pretrained_weight is not None:
    #     U.load_state(pretrained_weight, var_list=pi.get_variables())
    # # if a model path is provided
    # if load_model_path is not None:
    #     U.load_state(load_model_path)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / (max_timesteps + 1e7),
                             0.1)  # floor the multiplier at 0.1 instead of 0
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # def fisher_vector_product(p):
        #     return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        meanlosses = []
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, prevac, atarg, tdlamret = (seg["ob"], seg["ac"], seg["prevac"],
                                               seg["adv"], seg["tdlamret"])
            vpredbefore = seg["vpred"]  # predicted value function before update
            # print("before standardize atarg value: ", atarg)
            if atarg.std() != 0:
                atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            else:
                with open("debug.txt", "a+") as f:
                    print("atarg.std() is equal to 0", atarg, file=f)
            # print("atarg value: ", atarg)

            # convert prevac to one hot
            one_hot_prevac = []
            if type(prevac) is np.ndarray:
                depth = prevac.size
                one_hot_prevac = np.zeros((depth, 524))
                one_hot_prevac[np.arange(depth), prevac] = 1
            else:
                one_hot_prevac = np.zeros(524)
                one_hot_prevac[prevac] = 1
                one_hot_prevac = [one_hot_prevac]
            prevac = one_hot_prevac

            d = Dataset(dict(ob=ob, ac=ac, prevac=prevac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]
            # print("optim_batchsize: ", optim_batchsize)

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new()  # set old parameter values to new parameter values
            logger.log(fmt_row(13, loss_names))
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch['prevac'],
                                                batch["atarg"], batch["vtarg"], cur_lrmult)
                    g_adam.update(g, optim_stepsize * cur_lrmult)  # allmean(g)
                    x_newlosses = compute_losses(batch["ob"], batch["ac"], batch["prevac"],
                                                 batch["atarg"], batch["vtarg"], cur_lrmult)
                    meanlosses = [x_newlosses]
                    losses.append(x_newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))
            # meanlosses = losses

            # # logger.log("Evaluating losses...")
            # losses = []
            # for batch in d.iterate_once(optim_batchsize):
            #     newlosses = compute_losses(batch["ob"], batch["ac"], batch["prevac"],
            #                                batch["atarg"], batch["vtarg"], cur_lrmult)
            #     losses.append(newlosses)
            #
            # # meanlosses, _, _ = mpi_moments(losses, axis=0)  # it will be useful for multithreading

        meanlosses = np.mean(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        g_losses = meanlosses
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        global UP_TO_STEP
        ob_expert, ac_expert, prevac_expert = expert_dataset.get_next_batch(len(ob), UP_TO_STEP)
        batch_size = len(ob) // d_step
        d_losses = []  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch, prevac_batch in dataset.iterbatches(
                (ob, ac, prevac),
                include_final_partial_batch=False,
                batch_size=batch_size):
            # print("###### len(ob_batch): ", len(ob_batch))
            ob_expert, ac_expert, prevac_expert = expert_dataset.get_next_batch(
                len(ob_batch), UP_TO_STEP)
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))

            depth = len(ac_batch)
            one_hot_ac_batch = np.zeros((depth, 524))
            one_hot_ac_batch[np.arange(depth), ac_batch] = 1

            # depth = len(prevac_batch)
            # one_hot_prevac_batch = np.zeros((depth, 524))
            # one_hot_prevac_batch[np.arange(depth), prevac_batch] = 1

            depth = len(ac_expert)
            one_hot_ac_expert = np.zeros((depth, 524))
            one_hot_ac_expert[np.arange(depth), ac_expert] = 1

            depth = len(prevac_expert)
            one_hot_prevac_expert = np.zeros((depth, 524))
            one_hot_prevac_expert[np.arange(depth), prevac_expert] = 1

            *newlosses, g = discriminator.lossandgrad(ob_batch, one_hot_ac_batch, prevac_batch,
                                                      ob_expert, one_hot_ac_expert,
                                                      one_hot_prevac_expert)
            global LAST_EXPERT_ACC, LAST_EXPERT_LOSS
            LAST_EXPERT_ACC = newlosses[5]
            LAST_EXPERT_LOSS = newlosses[1]
            d_adam.update(g, d_stepsize)  # allmean(g)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far = len(lens)
        timesteps_so_far = sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0), iters_so_far)
            ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer),
                                              np.mean(rewbuffer),
                                              np.mean(lenbuffer)], iters_so_far)

        global ITER_SOFAR_GLOBAL
        ITER_SOFAR_GLOBAL = iters_so_far

        # log picked actions
        with open('ac.txt', 'a+') as fh:
            print(ac, file=fh)
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    assert isinstance(ob_space, gym.spaces.Box)  # fails (and exits) if the condition is false
    # print("mlp_policy/20lines")  # executed twice
    # print("ac_space.shape[0]", ac_space.shape[0])  # prints 3

    # make_pdtype returns DiagGaussianPdType(ac_space.shape[0]), which wraps pdclass()
    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    # returns tf.placeholder(dtype=dtype, shape=shape, name=name)
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[sequence_length] + list(ob_space.shape))
    # print("obspace.shape:::", list(ob_space.shape))  # prints [11]

    with tf.variable_scope("obfilter"):
        # print("gail-tf/gailtf/baselines/ppo1/mlp_policy.py/28lines:")
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)  # running mean/std filter

    # ob is still a placeholder at this point
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # stack fully connected layers for the value function
    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "vffc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    # note this head outputs a value estimate, not an action
    self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    last_out = obz
    for i in range(num_hid_layers):
        last_out = tf.nn.tanh(U.dense(last_out, hid_size, "polfc%i" % (i + 1),
                                      weight_init=U.normc_initializer(1.0)))
    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        print("gaussian_fixed_var is used")
        mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer())
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
    else:
        # print("gaussian_fixed_var is not used")  # this branch is not taken
        pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    # by default the pdtype above is DiagGaussianPdType, so this returns a
    # DiagGaussianPd; pd exposes kl, entropy, sample, and related methods
    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC
    # stochastic = tf.placeholder(dtype=tf.bool, shape=())
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    print("max_timesteps", max_timesteps)
    print("max_episodes", max_episodes)
    print("max_iters", max_iters)
    print("max_seconds", max_seconds)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32,
                           shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration: r_t(\theta) * A_t
    surr2 = U.clip(ratio, 1.0 - clip_param,
                   1.0 + clip_param) * atarg  # CLIP term of the update rule
    pol_surr = -U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate objective (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi, env, timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, I know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                # update step
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)  # apply g with Adam
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        print("... EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        print("... TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        print("... TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
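# pol_surr above is PPO's clipped surrogate: with probability ratio
# r = pi_new(a|s) / pi_old(a|s) and advantage A, it maximizes
# E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]. A minimal numpy sketch of
# that objective (toy values; not the TF graph built in learn above):

import numpy as np

def ppo_clip_objective(ratio, adv, clip_param):
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    # the loss minimized in learn is the negative of this mean
    return np.minimum(surr1, surr2).mean()

ratio = np.array([0.5, 1.0, 1.5])
adv = np.array([1.0, -1.0, 2.0])
# the clip keeps the objective from rewarding ratios far from 1, which
# limits how far a single batch of updates can move the policy
print(ppo_clip_objective(ratio, adv, clip_param=0.2))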