def test_seg_gen(sequence_size=1000,
                 attention_size=30,
                 hidden_size=30,
                 env_id='Hopper-v1',
                 cell_type='lstm'):
    """Smoke-test the trajectory segment generator end to end.

    Builds an MLP policy and a ``TrajectoryClassifier`` for the requested
    gym environment, pulls ten segments from ``traj_segment_generator``,
    runs ``traj2trans`` and ``add_vtarg_and_adv`` on each, and prints the
    shapes of the resulting arrays.

    Args:
        sequence_size: Forwarded to ``TrajectoryClassifier`` and to
            ``traj_segment_generator``.
        attention_size: Forwarded to ``TrajectoryClassifier``.
        hidden_size: Forwarded to ``TrajectoryClassifier``.
        env_id: Gym environment id to instantiate.
        cell_type: Forwarded to ``TrajectoryClassifier``.
    """
    from gailtf.baselines.ppo1 import mlp_policy
    from gailtf.network.adversary_traj import TrajectoryClassifier
    import gym

    # Use the caller-supplied id; the default keeps the previously
    # hard-coded "Hopper-v1" behaviour.
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=64,
                                    num_hid_layers=2)

    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn('pi', ob_space, ac_space)
    discriminator = TrajectoryClassifier(env, hidden_size, sequence_size,
                                         attention_size, cell_type)
    # The session is entered and deliberately left open for the rest of
    # the test process.
    U.make_session(num_cpu=2).__enter__()
    U.initialize()
    seg_gen = traj_segment_generator(pi, env, discriminator, 10, True,
                                     sequence_size)
    for _ in range(10):
        seg = next(seg_gen)
        # Result is unused; the call only checks that the conversion runs.
        traj2trans(seg["ep_trajs"], seg["ep_lens"], ob_space.shape[0])
        add_vtarg_and_adv(seg, gamma=0.995, lam=0.97)
        print(seg['adv'].shape, seg['tdlamret'].shape, seg['ob'].shape,
              seg['nextvpred'])
def evaluate(env,
             policy_func,
             load_model_path,
             timesteps_per_batch,
             number_trajs=10,
             stochastic_policy=False):
    """Roll out a saved policy and print its average episode length/return.

    Args:
        env: Environment providing ``observation_space`` and
            ``action_space``; passed on to ``traj_episode_generator``.
        policy_func: Callable ``(name, ob_space, ac_space, reuse=...)``
            that builds the policy.
        load_model_path: Checkpoint path given to ``U.load_state``.
        timesteps_per_batch: Forwarded to ``traj_episode_generator``.
        number_trajs: Number of episodes to draw from the generator.
        stochastic_policy: Forwarded as ``stochastic`` to the generator and
            used to pick the printed header.
    """
    from tqdm import tqdm

    # Build the policy network.
    observation_space = env.observation_space
    action_space = env.action_space
    policy = policy_func("pi", observation_space, action_space, reuse=False)
    U.initialize()

    # Create the episode generator first, then restore the saved weights.
    episodes = traj_episode_generator(policy,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=stochastic_policy)
    U.load_state(load_model_path)

    lengths = []
    returns = []
    for _ in tqdm(range(number_trajs)):
        episode = next(episodes)
        length, total_return = episode['ep_len'], episode['ep_ret']
        lengths.append(length)
        returns.append(total_return)

    header = 'stochastic policy:' if stochastic_policy else 'deterministic policy:'
    print(header)
    print("Average length:", sum(lengths) / len(lengths))
    print("Average return:", sum(returns) / len(returns))
def test_dist():
    """Check RunningMeanStd against numpy when updates are split over 2 ranks.

    Requires exactly two MPI processes. Rank 0 feeds the first three sample
    chunks and rank 1 the last three; the resulting mean/std are compared
    with the statistics of all six chunks concatenated.
    """
    np.random.seed(0)
    # Same draw order as before: 3, 4, 5 rows first, then 6, 7, 8 rows.
    rank0_chunks = [np.random.randn(rows, 1) for rows in (3, 4, 5)]
    rank1_chunks = [np.random.randn(rows, 1) for rows in (6, 7, 8)]

    comm = MPI.COMM_WORLD
    assert comm.Get_size() == 2
    if comm.Get_rank() == 0:
        local_chunks = rank0_chunks
    elif comm.Get_rank() == 1:
        local_chunks = rank1_chunks
    else:
        assert False

    rms = RunningMeanStd(epsilon=0.0, shape=(1, ))
    U.initialize()

    for chunk in local_chunks:
        rms.update(chunk)

    everything = np.concatenate(rank0_chunks + rank1_chunks)

    def checkallclose(expected, actual):
        print(expected, actual)
        return np.allclose(expected, actual)

    assert checkallclose(everything.mean(axis=0), U.eval(rms.mean))
    assert checkallclose(everything.std(axis=0), U.eval(rms.std))
def learn(args,
          env,
          policy_func,
          dataset,
          optim_batch_size=128,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4):
    """Pretrain a policy with behavior cloning, configured from ``args``.

    Minimises the mean squared difference between a fed action batch and
    ``pi.ac`` using ``MpiAdam``. Batches come from
    ``dataset.get_next_batch``; the ``'val'`` split is evaluated about ten
    times per run.

    Args:
        args: Object providing ``BC_max_iter``, ``pretrained``,
            ``checkpoint_dir``, ``log_dir``, ``task_name`` and
            ``max_to_keep``.
        env: Passed to ``policy_func``.
        policy_func: Callable ``(args, name, env)`` building a policy.
        dataset: Object with ``get_next_batch(batch_size, split)``.
        optim_batch_size: Training minibatch size.
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        optim_stepsize: Step size passed to ``adam.update``.

    Returns:
        The checkpoint path the model was saved to when ``args.pretrained``
        is truthy; otherwise ``None``.
    """
    # ============================== INIT FROM ARGS ==================================
    max_iters = args.BC_max_iter
    pretrained = args.pretrained
    ckpt_dir = args.checkpoint_dir
    log_dir = args.log_dir
    task_name = args.task_name
    # Clamp to 1: for max_iters < 10 the integer division yields 0, which
    # made the modulo in the loop below raise ZeroDivisionError.
    val_per_iter = max(1, int(max_iters / 10))

    pi = policy_func(args, "pi", env)  # Construct network for new policy
    # Result unused in this function; the call is kept so the same set of
    # networks is built as before (presumably so the saved state still
    # includes "oldpi" -- confirm before removing).
    policy_func(args, "oldpi", env)

    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters + 1))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        # Separate names so the loss tensor is not rebound by a float.
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [train_loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % val_loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)
    if pretrained:
        # Only the generated name is used as a save path; the
        # TemporaryDirectory object itself is discarded immediately.
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, max_to_keep=args.max_to_keep)
        return savedir_fname
def learn(env,
          policy_func,
          dataset,
          pretrained,
          optim_batch_size=128,
          max_iters=1e4,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          log_dir=None,
          task_name=None):
    """Pretrain a policy with behavior cloning.

    Minimises the mean squared difference between a fed action batch and
    ``pi.ac`` using ``MpiAdam``. Batches come from
    ``dataset.get_next_batch``; the ``'val'`` split is evaluated about ten
    times per run.

    Args:
        env: Environment providing ``observation_space``/``action_space``.
        policy_func: Callable ``(name, ob_space, ac_space)`` building a
            policy.
        dataset: Object with ``get_next_batch(batch_size, split)``.
        pretrained: When truthy, no summaries/checkpoints are written
            during training; the policy variables are saved once at the
            end and that path is returned.
        optim_batch_size: Training minibatch size.
        max_iters: Number of training iterations (converted with ``int``).
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        optim_stepsize: Step size passed to ``adam.update``.
        ckpt_dir: Checkpoint directory (used when ``pretrained`` is falsy).
        log_dir: Summary directory (used when ``pretrained`` is falsy).
        task_name: Checkpoint file name inside ``ckpt_dir``.

    Returns:
        The checkpoint path when ``pretrained`` is truthy; otherwise
        ``None``.
    """
    # Clamp to 1: for max_iters < 10 the integer division yields 0, which
    # made the modulo in the loop below raise ZeroDivisionError.
    val_per_iter = max(1, int(max_iters / 10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy

    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # Mean squared difference between the expert action and the policy action.
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    # Inputs: observation, action, stochastic-policy flag (bool).
    # Outputs: the loss above and its flattened gradient.
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        # Separate names so the loss tensor is not rebound by a float.
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [train_loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % val_loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)
    if pretrained:
        # Only the generated name is used as a save path; the
        # TemporaryDirectory object itself is discarded immediately.
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, var_list=pi.get_variables())
        return savedir_fname
def test_function():
    """Check ``function`` with positional/keyword inputs and a ``givens`` default.

    ``z = 3*x + 2*y`` with ``y`` defaulting to 0 through ``givens``.
    """
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    # (positional args, keyword args, expected value), in the original order.
    cases = (
        ((2, ), {}, 6),
        ((), {"x": 3}, 9),
        ((2, 2), {}, 10),
        ((), {"x": 2, "y": 3}, 12),
    )
    with single_threaded_session():
        initialize()
        for positional, keywords, expected in cases:
            assert lin(*positional, **keywords) == expected
def test_runningmeanstd():
    """Compare RunningMeanStd after three updates with numpy's mean/std.

    Runs once with 1-D samples and once with 2-column samples.
    """
    sample_sets = [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]
    for chunks in sample_sets:
        rms = RunningMeanStd(epsilon=0.0, shape=chunks[0].shape[1:])
        U.initialize()

        stacked = np.concatenate(list(chunks), axis=0)
        expected = [stacked.mean(axis=0), stacked.std(axis=0)]

        for chunk in chunks:
            rms.update(chunk)
        actual = U.eval([rms.mean, rms.std])

        assert np.allclose(expected, actual)
def test_multikwargs():
    """Check ``function`` when two placeholders are both created with name "x".

    Positional calls must work; the keyword call ``lin(x=2)`` is expected
    to raise ``AssertionError``.
    """
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2

    lin = function([x, x2], z, givens={x2: 0})
    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        try:
            lin(x=2)
        except AssertionError:
            keyword_call_rejected = True
        else:
            keyword_call_rejected = False
        assert keyword_call_rejected
def test(expert_path,
         sequence_size=1000,
         attention_size=30,
         hidden_size=30,
         env_id='Hopper-v1',
         cell_type='lstm'):
    """Smoke-test ``TrajectoryClassifier`` on two expert trajectory batches.

    Loads expert trajectories from ``expert_path``, runs one
    ``lossandgrad`` call on two batches of ten trajectories, and prints
    the shape of the rewards computed for the first batch.
    """
    from gailtf.dataset.mujoco_traj import Mujoco_Traj_Dset
    import gym

    # The session is entered and left open for the rest of the process.
    U.make_session(num_cpu=2).__enter__()
    expert_data = Mujoco_Traj_Dset(expert_path)
    env = gym.make(env_id)

    first_trajs, first_lens = expert_data.get_next_traj_batch(10)
    second_trajs, second_lens = expert_data.get_next_traj_batch(10)

    discriminator = TrajectoryClassifier(env, hidden_size, sequence_size,
                                         attention_size, cell_type)
    U.initialize()

    # Outputs are unused; the call only checks that the graph runs.
    *_losses, _grad = discriminator.lossandgrad(first_trajs, first_lens,
                                                second_trajs, second_lens,
                                                0.5)
    rewards = discriminator.get_rewards(first_trajs, first_lens)
    print(rewards.shape)
def learn(env,
          policy_func,
          dataset,
          pretrained,
          optim_batch_size=128,
          max_iters=1e3,
          adam_epsilon=1e-6,
          optim_stepsize=2e-4,
          ckpt_dir=None,
          log_dir=None,
          task_name=None,
          high_level=False):
    """Behavior-clone either the low-level or the high-level policy.

    With ``high_level`` falsy, a policy named "pi_low" is built on
    ``ac_space.spaces[1]`` and trained with a mean-squared-error loss.
    With ``high_level`` truthy, a policy named "pi_high" is built on
    ``ac_space.spaces[0]`` and trained with softmax cross-entropy against
    one-hot labels of depth 3.

    Args:
        env: Environment providing ``observation_space`` and an
            ``action_space`` with a ``spaces`` sequence.
        policy_func: Callable ``(name, ob_space, ac_space)`` building a
            policy exposing ``ob``, ``stochastic``, ``pdtype`` and, for the
            high level, ``logits``.
        dataset: Object with ``get_next_batch(batch_size, split, high_level)``.
        pretrained: When truthy, no summaries/checkpoints are written
            during training; the policy variables are saved once at the
            end and that path is returned.
        optim_batch_size: Training minibatch size.
        max_iters: Number of training iterations (converted with ``int``).
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        optim_stepsize: Step size passed to the optimizer update.
        ckpt_dir: Checkpoint directory (used when ``pretrained`` is falsy).
        log_dir: Summary directory (used when ``pretrained`` is falsy).
        task_name: Checkpoint file name inside ``ckpt_dir``.
        high_level: Selects which of the two policies is trained.

    Returns:
        The checkpoint path when ``pretrained`` is truthy; otherwise
        ``None`` (after printing the elapsed time).
    """
    # Clamp to 1: for max_iters < 100 the integer division yields 0, which
    # made the modulo in the training loops raise ZeroDivisionError.
    val_per_iter = max(1, int(max_iters / 100))
    ob_space = env.observation_space
    ac_space = env.action_space
    start_time = time.time()

    if not high_level:
        pi_low = policy_func("pi_low", ob_space, ac_space.spaces[1])

        # placeholder
        ob_low = pi_low.ob
        ac_low = pi_low.pdtype.sample_placeholder([None])
        stochastic_low = pi_low.stochastic
        loss_low = tf.reduce_mean(tf.square(ac_low - pi_low.ac))
        var_list_low = pi_low.get_trainable_variables()
        adam_low = MpiAdam(var_list_low, epsilon=adam_epsilon)
        lossandgrad_low = U.function(
            [ob_low, ac_low, stochastic_low],
            [loss_low] + [U.flatgrad(loss_low, var_list_low)])

        if not pretrained:
            writer = U.FileWriter(log_dir)
            ep_stats_low = stats(["Loss_low"])
        U.initialize()
        adam_low.sync()
        logger.log("Pretraining with Behavior Cloning Low...")
        for iter_so_far in tqdm(range(int(max_iters))):
            ob_expert, ac_expert = dataset.get_next_batch(
                optim_batch_size, 'train', high_level)
            loss, g = lossandgrad_low(ob_expert, ac_expert, True)
            adam_low.update(g, optim_stepsize)
            if not pretrained:
                ep_stats_low.add_all_summary(writer, [loss], iter_so_far)
            if iter_so_far % val_per_iter == 0:
                ob_expert, ac_expert = dataset.get_next_batch(
                    -1, 'val', high_level)
                loss, g = lossandgrad_low(ob_expert, ac_expert, False)
                logger.log("Validation:")
                logger.log("Loss: %f" % loss)
                if not pretrained:
                    U.save_state(os.path.join(ckpt_dir, task_name),
                                 counter=iter_so_far)
        if pretrained:
            # Only the generated name is used as a save path.
            savedir_fname = tempfile.TemporaryDirectory().name
            U.save_state(savedir_fname, var_list=pi_low.get_variables())
            return savedir_fname
    else:
        pi_high = policy_func("pi_high", ob_space,
                              ac_space.spaces[0])  # high -> action_label

        ob_high = pi_high.ob
        ac_high = pi_high.pdtype.sample_placeholder([None, 1])
        onehot_labels = tf.one_hot(indices=tf.cast(ac_high, tf.int32),
                                   depth=3)
        stochastic_high = pi_high.stochastic
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=pi_high.logits, labels=onehot_labels)
        loss_high = tf.reduce_mean(cross_entropy)
        var_list_high = pi_high.get_trainable_variables()
        adam_high = MpiAdam(var_list_high, epsilon=adam_epsilon)
        lossandgrad_high = U.function(
            [ob_high, ac_high, stochastic_high],
            [loss_high] + [U.flatgrad(loss_high, var_list_high)])

        # train high level policy
        if not pretrained:
            writer = U.FileWriter(log_dir)
            ep_stats_high = stats(["loss_high"])
        U.initialize()
        adam_high.sync()
        logger.log("Pretraining with Behavior Cloning High...")
        for iter_so_far in tqdm(range(int(max_iters))):
            ob_expert, ac_expert = dataset.get_next_batch(
                optim_batch_size, 'train', high_level)
            loss, g = lossandgrad_high(ob_expert, ac_expert, True)
            adam_high.update(g, optim_stepsize)
            if not pretrained:
                ep_stats_high.add_all_summary(writer, [loss], iter_so_far)
            if iter_so_far % val_per_iter == 0:
                ob_expert, ac_expert = dataset.get_next_batch(
                    -1, 'val', high_level)
                loss, g = lossandgrad_high(ob_expert, ac_expert, False)
                logger.log("Validation:")
                logger.log("Loss: %f" % loss)
                if not pretrained:
                    U.save_state(os.path.join(ckpt_dir, task_name),
                                 counter=iter_so_far)
        if pretrained:
            # Only the generated name is used as a save path.
            savedir_fname = tempfile.TemporaryDirectory().name
            U.save_state(savedir_fname, var_list=pi_high.get_variables())
            return savedir_fname

    # Reached only when `pretrained` is falsy (both branches return early
    # otherwise).
    print("--- %s seconds ---" % (time.time() - start_time))
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        episodes_per_batch,  # what to train on
        dropout_keep_prob,
        sequence_size,  # rnn parameters
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None):
    """Alternate TRPO policy updates with trajectory-discriminator updates.

    Each outer iteration runs ``g_step`` generator updates (conjugate
    gradient step, backtracking line search under a KL limit, then
    ``vf_iters`` passes of value-function Adam steps), followed by
    discriminator updates on minibatches of generated versus expert
    trajectories. Gradients are averaged over MPI workers.

    Exactly one of ``max_iters``, ``max_timesteps`` and ``max_episodes``
    must be positive; it determines when the loop stops.

    Args:
        env: Environment providing ``observation_space``/``action_space``.
        policy_func: Callable ``(name, ob_space, ac_space, reuse=...)``.
        discriminator: Object exposing ``get_trainable_variables``,
            ``loss_name``, ``lossandgrad`` and optionally ``obs_rms``.
        expert_dataset: Object with ``get_next_traj_batch(n)``.
        pretrained: Not read inside this function.
        pretrained_weight: Optional checkpoint path loaded into the policy
            variables before training; also switches ``reuse`` on for "pi".
        g_step: Generator updates per outer iteration.
        d_step: Divisor used to derive the discriminator minibatch size.
        episodes_per_batch: Forwarded to ``traj_segment_generator``.
        dropout_keep_prob: Forwarded to ``discriminator.lossandgrad``.
        sequence_size: Forwarded to the generator as ``seq_length``.
        max_kl: KL limit for the policy step.
        cg_iters: Conjugate-gradient iterations.
        gamma, lam: Forwarded to ``add_vtarg_and_adv``.
        entcoeff: Entropy bonus coefficient.
        cg_damping: Damping added to the Fisher-vector product.
        vf_stepsize, d_stepsize: Adam step sizes.
        vf_iters: Value-function passes per generator update.
        callback: Optional callable invoked each iteration with
            ``(locals(), globals())``; local names are therefore part of
            what it can observe.
        save_per_iter, ckpt_dir, task_name: Checkpointing controls.
        log_dir: Summary directory.
        load_model_path: Optional checkpoint restored before training.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # Split the policy's variables by the second component of their name.
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    # Slice the flat tangent vector back into per-variable shapes.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function(
        [ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Element-wise mean of `x` across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    # Start every worker from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     episodes_per_batch,
                                     stochastic=True,
                                     seq_length=sequence_size)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            # `fvpargs` is bound later, inside the generator loop below.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # Every fifth sample is used for the Fisher-vector product.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            # Default for the zero-gradient branch, where the line search
            # (the only other place `meanlosses` is assigned) is skipped.
            # Without it the logging below raised NameError on the first
            # iteration or reported stale values later. The parameters are
            # unchanged in that branch, so the pre-update losses are the
            # current ones.
            meanlosses = lossbefore
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                # Periodic check that all workers hold the same parameters.
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0])
                        for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        traj_gen, traj_len_gen = seg["ep_trajs"], seg["ep_lens"]
        batch_size = len(traj_gen) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for traj_batch, traj_len_batch in dataset.iterbatches(
            (traj_gen, traj_len_gen),
                include_final_partial_batch=False,
                batch_size=batch_size):
            traj_expert, traj_len_expert = expert_dataset.get_next_traj_batch(
                len(traj_batch))
            # update running mean/std for discriminator
            ob_batch, _ = traj2trans(traj_batch, traj_len_batch,
                                     ob_space.shape[0])
            ob_expert, _ = traj2trans(traj_expert, traj_len_expert,
                                      ob_space.shape[0])
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(traj_batch,
                                                      traj_len_batch,
                                                      traj_expert,
                                                      traj_len_expert,
                                                      dropout_keep_prob)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # Gather episode statistics from every worker.
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Train a policy with the clipped-surrogate PPO objective.

    Builds "pi" and "oldpi" policies, a clipped surrogate loss plus
    entropy penalty and value-function loss, and optimises them with
    ``MpiAdam`` for ``optim_epochs`` passes over each sampled segment.

    Exactly one of ``max_iters``, ``max_timesteps``, ``max_episodes`` and
    ``max_seconds`` must be positive (asserted below).

    When ``task == 'sample_trajectory'`` no training happens: the function
    calls ``sample_trajectory(...)`` and then ``sys.exit()``.

    The optional ``callback`` receives ``(locals(), globals())`` at the
    start of every iteration, so the local variable names used here are
    visible to it.
    """
    print("max_timrsteps", max_timesteps)
    print("max_episodes", max_episodes)
    print("max_iters", max_iters)
    print("max_seconds", max_seconds)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration : r_t(\theta)*A_t
    surr2 = U.clip(ratio, 1.0 - clip_param,
                   1.0 + clip_param) * atarg  # CLIP term of the update rule
    pol_surr = -U.mean(tf.minimum(
        surr1,
        surr2))  # PPO's pessimistic surrogate (L^CLIP), the objective function
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # NOTE(review): divides by max_timesteps, which is 0 whenever a
            # different time constraint is the active one -- confirm
            # 'linear' is only used together with max_timesteps.
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # From here on ob/ac/atarg name the sampled arrays, no longer the
        # placeholders (the compiled functions above already hold those).
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        # A falsy optim_batchsize means "use the whole segment as one batch".
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                # update step
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize *
                            cur_lrmult)  # apply gradient g with Adam
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # Gather episode lengths and returns from every MPI worker.
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        print("... EpisodesSoFar ", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        print("... TimestepsSoFar ", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        print("... TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
def evaluate(env,
             policy_func,
             load_model_path,
             timesteps_per_batch,
             number_trajs=10,
             stochastic_policy=False):
    """Play one full game with a saved policy against the environment.

    The policy picks a function id (one of 524). A second, separately
    saved model, restored into its own graph and session, predicts the
    arguments for that function id. The resulting
    ``sc_action.FunctionCall`` is stepped through ``env`` until the
    timestep reports ``last()``.

    Args:
        env: Environment with ``reset()`` and ``step([...])`` returning a
            sequence of timesteps.
        policy_func: Callable ``(name, ob_space, ac_space, reuse=...)``.
        load_model_path: Checkpoint path passed to ``U.load_state``.
        timesteps_per_batch: Unused; kept for interface compatibility.
        number_trajs: Unused; kept for interface compatibility.
        stochastic_policy: Forwarded to ``pi.act``.
    """
    # have it play with scripted bot for one full game
    ob_space = spaces.Box(low=-1000,
                          high=10000,
                          shape=(5 * 64 * 64 + 10 * 64 * 64 + 11 + 524, ))
    ac_space = spaces.Discrete(524)
    pi = policy_func("pi", ob_space, ac_space, reuse=False)
    U.initialize()
    U.load_state(load_model_path)

    original_graph = tf.Graph()
    param_sess = tf.Session(graph=original_graph)
    # try/finally rather than `with param_sess:` on purpose. The context
    # manager would also install param_sess as the default session, which
    # the policy presumably relies on staying unchanged.
    try:
        saved_model_path = os.path.expanduser(
            '~'
        ) + '/pysc2-gail-research-project/supervised_learning_baseline/param_pred_model/action_params'
        with original_graph.as_default():
            saver = tf.train.import_meta_graph(saved_model_path + '.meta',
                                               clear_devices=True)
            saver.restore(param_sess, saved_model_path)

            # placeholder
            minimap_placeholder = original_graph.get_tensor_by_name(
                "minimap_placeholder:0")
            screen_placeholder = original_graph.get_tensor_by_name(
                "screen_placeholder:0")
            user_info_placeholder = original_graph.get_tensor_by_name(
                "user_info_placeholder:0")
            action_placeholder = original_graph.get_tensor_by_name(
                "action_placeholder:0")

            # ops
            control_group_act_cls = original_graph.get_tensor_by_name(
                "control_group_act_cls:0")
            screen_output_pred = original_graph.get_tensor_by_name(
                "screen_param_prediction:0")
            minimap_output_pred = original_graph.get_tensor_by_name(
                "minimap_param_prediction:0")
            screen2_output_pred = original_graph.get_tensor_by_name(
                "screen2_param_prediction:0")
            queued_pred_cls = original_graph.get_tensor_by_name(
                "queued_pred_cls:0")
            control_group_id_output = original_graph.get_tensor_by_name(
                "control_group_id_output:0")
            select_point_act_cls = original_graph.get_tensor_by_name(
                "select_point_act_cls:0")
            select_add_pred_cls = original_graph.get_tensor_by_name(
                "select_add_pred_cls:0")
            select_unit_act_cls = original_graph.get_tensor_by_name(
                "select_unit_act_cls:0")
            select_unit_id_output = original_graph.get_tensor_by_name(
                "select_unit_id_output:0")
            select_worker_cls = original_graph.get_tensor_by_name(
                "select_worker_cls:0")
            build_queue_id_output = original_graph.get_tensor_by_name(
                "build_queue_id_output:0")
            unload_id_output = original_graph.get_tensor_by_name(
                "unload_id_output:0")

        timesteps = env.reset()
        state_dict, ob = extract_observation(timesteps[0])
        is_done = False
        ac = 0
        while not is_done:
            prevac = ac
            ac, vpred = pi.act(stochastic_policy, ob, prevac)

            function_type = sc_action.FUNCTIONS[ac].function_type.__name__
            one_hot_ac = np.zeros((1, 524))  # shape is 1*524
            one_hot_ac[np.arange(1), [ac]] = 1
            ac_args = []

            reshaped_minimap = np.reshape(np.array(state_dict['minimap']),
                                          (64, 64, 5))
            reshaped_screen = np.reshape(np.array(state_dict['screen']),
                                         (64, 64, 10))
            feed_dict = {
                minimap_placeholder: [reshaped_minimap],
                screen_placeholder: [reshaped_screen],
                action_placeholder: one_hot_ac,
                user_info_placeholder: [state_dict['player']]
            }

            # Predict the arguments required by the chosen function type.
            if function_type == 'move_camera':
                temp_arg1 = param_sess.run(
                    [minimap_output_pred],
                    feed_dict)  # temp_arg1 looks like [[[x, y]]]
                # shape of minimap output is different from screen and screen2
                temp_arg1 = process_coordinates_param_nn_output(temp_arg1[0])
                ac_args.append(temp_arg1)
            elif function_type == 'select_point':
                temp_arg1, temp_arg2 = param_sess.run(
                    [select_point_act_cls, screen_output_pred], feed_dict)
                ac_args.append(temp_arg1)
                temp_arg2 = process_coordinates_param_nn_output(temp_arg2)
                ac_args.append(temp_arg2)
            elif function_type == 'select_rect':
                temp_arg1, temp_arg2, temp_arg3 = param_sess.run([
                    select_add_pred_cls, screen_output_pred,
                    screen2_output_pred
                ], feed_dict)
                ac_args.append(temp_arg1)
                temp_arg2 = process_coordinates_param_nn_output(temp_arg2)
                ac_args.append(temp_arg2)
                temp_arg3 = process_coordinates_param_nn_output(temp_arg3)
                ac_args.append(temp_arg3)
            elif function_type == 'select_unit':
                temp_arg1, temp_arg2 = param_sess.run(
                    [select_unit_act_cls, select_unit_id_output], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                temp_arg2 = flatten_param(temp_arg2)
                temp_arg2 = temp_arg2.astype(int)
                ac_args.append(temp_arg1)
                ac_args.append(temp_arg2)
            elif function_type == 'control_group':
                temp_arg1, temp_arg2 = param_sess.run(
                    [control_group_act_cls, control_group_id_output],
                    feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                temp_arg2 = flatten_param(temp_arg2)
                temp_arg2 = temp_arg2.astype(int)
                ac_args.append(temp_arg1)
                ac_args.append(temp_arg2)
            elif function_type == 'select_idle_worker':
                temp_arg1 = param_sess.run([select_worker_cls], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                ac_args.append(temp_arg1)
            elif function_type == 'select_army':
                temp_arg1 = param_sess.run([select_add_pred_cls], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                ac_args.append(temp_arg1)
            elif function_type == 'select_warp_gates':
                temp_arg1 = param_sess.run([select_add_pred_cls], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                ac_args.append(temp_arg1)
            elif function_type == 'unload':
                temp_arg1 = param_sess.run([unload_id_output], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                temp_arg1 = temp_arg1.astype(int)
                ac_args.append(temp_arg1)
            elif function_type == 'build_queue':
                temp_arg1 = param_sess.run([build_queue_id_output], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                temp_arg1 = temp_arg1.astype(int)
                ac_args.append(temp_arg1)
            elif function_type == 'cmd_quick':
                temp_arg1 = param_sess.run([queued_pred_cls], feed_dict)
                temp_arg1 = flatten_param(temp_arg1)
                ac_args.append(temp_arg1)
            elif function_type == 'cmd_screen':
                temp_arg1, temp_arg2 = param_sess.run(
                    [queued_pred_cls, screen_output_pred], feed_dict)
                temp_arg1 = np.array(temp_arg1)
                temp_arg1 = temp_arg1.flatten()
                ac_args.append(temp_arg1)
                temp_arg2 = process_coordinates_param_nn_output(temp_arg2)
                ac_args.append(temp_arg2)
            elif function_type == 'cmd_minimap':
                temp_arg1, temp_arg2 = param_sess.run(
                    [queued_pred_cls, minimap_output_pred], feed_dict)
                temp_arg1 = np.array(temp_arg1)
                temp_arg1 = temp_arg1.flatten()
                ac_args.append(temp_arg1)
                temp_arg2 = process_coordinates_param_nn_output(temp_arg2[0])
                ac_args.append(temp_arg2)
            elif function_type == 'no_op' or function_type == 'select_larva' or function_type == 'autocast':
                # do nothing: these function types are called with no arguments
                pass
            else:
                # Best effort: report and still issue the call with no args.
                print("UNKNOWN FUNCTION TYPE: ", function_type)

            ac_with_param = sc_action.FunctionCall(ac, ac_args)
            print('take action with param: ', ac_with_param)
            timesteps = env.step([ac_with_param])
            print('env reward: ', timesteps[0].reward)
            state_dict, ob = extract_observation(timesteps[0], ac)
            is_done = timesteps[0].last()
    finally:
        # Release the argument-prediction session even if the game loop
        # raises; previously it was never closed.
        param_sess.close()
def _one_hot(indices, num_classes):
    """Return a (number of indices, num_classes) array with a 1 at each index."""
    depth = np.size(indices)
    encoded = np.zeros((depth, num_classes))
    encoded[np.arange(depth), indices] = 1
    return encoded


def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.001,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=1.5e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None,
        timesteps_per_actorbatch=16,
        clip_param=1e-5,
        adam_epsilon=4e-4,
        optim_epochs=1,
        optim_stepsize=4e-4,
        optim_batchsize=16,
        schedule='linear'):
    """Alternate clipped-surrogate policy updates with discriminator updates.

    Each outer iteration runs ``g_step`` generator updates (sample a segment,
    compute advantages, run ``optim_epochs`` epochs of minibatch Adam on the
    clipped surrogate + entropy penalty + value loss) and then one pass of
    discriminator updates on policy batches paired with expert batches.

    :param env: environment handed to ``traj_segment_generator``.
    :param policy_func: callable ``(name, ob_space, ac_space[, reuse])``
        building a policy; called for ``"pi"`` and ``"oldpi"``.
    :param discriminator: object providing ``get_trainable_variables()``,
        ``lossandgrad(...)``, ``loss_name`` and optionally ``obs_rms``.
    :param expert_dataset: object providing ``get_next_batch(n, up_to_step)``.
    :param pretrained: not read by this function.
    :param pretrained_weight: only tested against ``None`` to choose the
        ``reuse`` flag of the ``"pi"`` policy.
    :param g_step: generator updates per outer iteration (the code after the
        generator loop needs at least one to have run).
    :param d_step: the discriminator minibatch size is ``len(ob) // d_step``.
    :param timesteps_per_batch: forwarded to ``traj_segment_generator``.
    :param gamma: forwarded to ``add_vtarg_and_adv``.
    :param lam: forwarded to ``add_vtarg_and_adv``.
    :param entcoeff: weight of the entropy penalty.
    :param d_stepsize: Adam step size for the discriminator.
    :param max_timesteps: stop once this many timesteps were collected.
    :param max_episodes: stop once this many episodes were collected.
    :param max_iters: stop after this many outer iterations. Exactly one of
        the three ``max_*`` limits above must be positive.
    :param callback: if set, called as ``callback(locals(), globals())`` at
        the start of every iteration, so local names here are part of what
        the callback can observe.
    :param save_per_iter: checkpoint every this many iterations.
    :param ckpt_dir: checkpoint directory; no checkpoints when ``None``.
    :param log_dir: directory for the summary writer.
    :param task_name: checkpoint file name inside ``ckpt_dir``.
    :param clip_param: clipping range, scaled by the learning-rate multiplier.
    :param adam_epsilon: epsilon of the generator's Adam optimizer.
    :param optim_epochs: epochs over each sampled segment.
    :param optim_stepsize: generator Adam step size (scaled by the multiplier).
    :param optim_batchsize: generator minibatch size; a falsy value means the
        whole segment.
    :param schedule: ``'constant'`` or ``'linear'``.
    :raises NotImplementedError: for any other ``schedule`` value.

    ``max_kl``, ``cg_iters``, ``cg_damping``, ``vf_stepsize``, ``vf_iters``,
    ``max_seconds``, ``load_model_path`` and ``timesteps_per_actorbatch`` are
    accepted but not read by this function.
    """
    # Module-level state: UP_TO_STEP is read, the other three are written.
    global UP_TO_STEP, LAST_EXPERT_ACC, LAST_EXPERT_LOSS, ITER_SOFAR_GLOBAL

    nworkers = MPI.COMM_WORLD.Get_size()
    print("##### nworkers: ", nworkers)
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    from gym import spaces
    num_actions = 524  # size of the discrete action space / one-hot width
    ob_space = spaces.Box(low=-1000,
                          high=10000,
                          shape=(5 * 64 * 64 + 10 * 64 * 64 + 11 + 524, ))
    ac_space = spaces.Discrete(num_actions)
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    prevac_placeholder = U.get_placeholder_cached(name="last_action_one_hot")

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, prevac_placeholder, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    g_adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function(
        [ob, ac, prevac_placeholder, atarg, ret, lrmult], losses)

    d_adam = MpiAdam(discriminator.get_trainable_variables())

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)

    @contextmanager
    def timed(msg):
        # Only rank 0 prints the timing banner; other ranks just run the body.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    # Broadcast rank 0's initial parameters so all workers start identical.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    g_adam.sync()
    d_adam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     timesteps_per_batch,
                                     expert_dataset,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / (max_timesteps + 1e7),
                0.1)  # make the smallest number as 0.1 instead of 0
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        meanlosses = []
        for _ in range(g_step):
            with timed("sampling"):
                seg = next(seg_gen)
            add_vtarg_and_adv(seg, gamma, lam)
            # From here on ob/ac/atarg name the sampled arrays, shadowing the
            # placeholders above (already captured by the U.function objects).
            ob, ac, prevac, atarg, tdlamret = seg["ob"], seg["ac"], seg[
                "prevac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update

            if atarg.std() != 0:
                atarg = (atarg - atarg.mean()) / atarg.std(
                )  # standardized advantage function estimate
            else:
                # Cannot standardize a constant advantage; record it instead.
                with open("debug.txt", "a+") as f:
                    print("atarg.std() is equal to 0", atarg, file=f)

            # convert prevac to one hot
            if isinstance(prevac, np.ndarray):
                one_hot_prevac = _one_hot(prevac, num_actions)
            else:
                # Single action index: encode it and wrap as a batch of one.
                one_hot_prevac = np.zeros(num_actions)
                one_hot_prevac[prevac] = 1
                one_hot_prevac = [one_hot_prevac]
            prevac = one_hot_prevac

            d = Dataset(dict(ob=ob,
                             ac=ac,
                             prevac=prevac,
                             atarg=atarg,
                             vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            logger.log(fmt_row(13, loss_names))
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch['prevac'],
                                                batch["atarg"],
                                                batch["vtarg"], cur_lrmult)
                    g_adam.update(g, optim_stepsize * cur_lrmult)
                    # Logged losses are re-evaluated after the Adam step.
                    x_newlosses = compute_losses(batch["ob"], batch["ac"],
                                                 batch["prevac"],
                                                 batch["atarg"],
                                                 batch["vtarg"], cur_lrmult)
                    losses.append(x_newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            meanlosses = np.mean(losses, axis=0)

        g_losses = meanlosses
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        # NOTE(review): this batch is overwritten inside the loop before it is
        # used; the call is kept in case get_next_batch advances dataset state
        # that later batches depend on -- confirm and drop if it does not.
        ob_expert, ac_expert, prevac_expert = expert_dataset.get_next_batch(
            len(ob), UP_TO_STEP)
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch, prevac_batch in dataset.iterbatches(
            (ob, ac, prevac),
                include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert, prevac_expert = expert_dataset.get_next_batch(
                len(ob_batch), UP_TO_STEP)
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))

            # prevac_batch is already one-hot (encoded in the G step above).
            one_hot_ac_batch = _one_hot(ac_batch, num_actions)
            one_hot_ac_expert = _one_hot(ac_expert, num_actions)
            one_hot_prevac_expert = _one_hot(prevac_expert, num_actions)

            *newlosses, g = discriminator.lossandgrad(
                ob_batch, one_hot_ac_batch, prevac_batch, ob_expert,
                one_hot_ac_expert, one_hot_prevac_expert)
            LAST_EXPERT_ACC = newlosses[5]
            LAST_EXPERT_LOSS = newlosses[1]
            d_adam.update(g, d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        # Accumulate: the stop conditions and the linear schedule above read
        # these as running totals, not as the size of the latest batch.
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)

        ITER_SOFAR_GLOBAL = iters_so_far

        # log ac picked
        with open('ac.txt', 'a+') as fh:
            print(ac, file=fh)
def _window_close_requested():
    """Drain the pygame event queue and report whether a QUIT event arrived."""
    return any(event.type == pygame.QUIT for event in pygame.event.get())


def replayACS(env, modelPath, transpose=True, fps=30, zoom=None):
    """
    Replays a game from recorded trajectories using actions

    This method is not precise though, because it indirectly recovers
    environment states from actions. Sometimes it gets asynchronous and
    distorts the real trajectory.

    The recorded trajectories are replayed in a loop until the pygame window
    is closed; pygame is shut down on every exit path. If the file holds no
    trajectories the function returns immediately.

    :param env: Atari environment
    :param modelPath: path of the pickle file holding the recorded
        trajectories; each entry must expose its actions under ``'ac'``
    :param transpose: passed to ``display_arr``; when true the window size is
        also swapped to (height, width) order
    :param fps: frame-rate cap passed to ``clock.tick``
    :param zoom: scale factor for the window size; ``None`` means 1
    :return: None
    """
    global obs

    # NOTE: pickle can execute arbitrary code while loading; only replay
    # trajectory files from a trusted source.
    with open(modelPath, 'rb') as rfp:
        trajectories = pkl.load(rfp)

    U.make_session(num_cpu=1).__enter__()
    U.initialize()

    # Unwrap until an ActionWrapper is found or there is no inner `.env` left.
    tempEnv = env
    while not isinstance(tempEnv, ActionWrapper):
        try:
            tempEnv = tempEnv.env
        except AttributeError:
            break

    # using ActionWrapper:
    if isinstance(tempEnv, ActionWrapper):
        obs_s = tempEnv.screen_space
    else:
        obs_s = env.observation_space

    # Only 2-D frames or 3-D frames with 1 or 3 channels can be displayed.
    assert len(obs_s.shape) == 2 or (len(obs_s.shape) == 3
                                     and obs_s.shape[2] in [1, 3])

    if zoom is None:
        zoom = 1
    video_size = int(obs_s.shape[0] * zoom), int(obs_s.shape[1] * zoom)
    if transpose:
        video_size = tuple(reversed(video_size))

    # setup the screen using pygame
    flags = RESIZABLE | HWSURFACE | DOUBLEBUF
    screen = pygame.display.set_mode(video_size, flags)
    pygame.event.set_blocked(pygame.MOUSEMOTION)
    clock = pygame.time.Clock()

    running = True
    playerScore = opponentScore = 0
    wins = losses = ties = gamesTotal = totalPlayer = totalOpponent = 0

    try:
        while running:
            trl = len(trajectories)
            if trl == 0:
                # Nothing to replay; without this the loop would spin forever.
                break
            for i in range(trl):
                obs = env.reset()
                print("\nRunning trajectory {}".format(i))
                print("Length {}".format(len(trajectories[i]['ac'])))

                for ac in tqdm(trajectories[i]['ac']):
                    if not isinstance(ac, list):
                        ac = np.atleast_1d(ac)
                    obs, reward, _, _ = env.step(ac)

                    # track of player score:
                    if reward > 0:
                        playerScore += abs(reward)
                    else:
                        opponentScore += abs(reward)

                    # Prefer the environment's rendered image when available.
                    if hasattr(env, 'getImage'):
                        obs = env.getImage()

                    if obs is not None:
                        # Expand grayscale frames to three channels.
                        if len(obs.shape) == 2:
                            obs = obs[:, :, None]
                        if obs.shape[2] == 1:
                            obs = obs.repeat(3, axis=2)
                        display_arr(screen, obs, video_size, transpose)
                        pygame.display.flip()
                        clock.tick(fps)

                    # Pump events every frame so the window stays responsive
                    # and closing it ends the replay.
                    if _window_close_requested():
                        running = False
                        break

                if not running:
                    break

                msg = "End of game: score %d - %d" % (playerScore,
                                                      opponentScore)
                print(colorize(msg, color='red'))
                gamesTotal += 1
                if playerScore > opponentScore:
                    wins += 1
                elif opponentScore > playerScore:
                    losses += 1
                else:
                    ties += 1

                totalPlayer += playerScore
                totalOpponent += opponentScore
                playerScore = opponentScore = 0

                msg = "Status so far: \nGames played - %d wins - %d losses - %d ties - %d\n Total score: %d - %d" % (
                    gamesTotal, wins, losses, ties, totalPlayer,
                    totalOpponent)
                print(colorize(msg, color='red'))
    finally:
        pygame.quit()