def __init__(self, ob_dim, ac_dim): # Here we'll construct a bunch of expressions, which will be used in two places: # (1) When sampling actions # (2) When computing loss functions, for the policy update # Variables specific to (1) have the word "sampled" in them, # whereas variables specific to (2) have the word "old" in them ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate wd_dict = {} h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output self.wd_dict = wd_dict self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs logstd_1a = tf.expand_dims(logstd_1a, 0) std_1a = tf.exp(logstd_1a) std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy self.compute_kl = U.function([ob_no, oldac_dist], kl) self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss U.initialize() # Initialize uninitialized TF variables
def test_multikwargs(): with tf.Graph().as_default(): x = tf.placeholder(tf.int32, (), name="x") with tf.variable_scope("other"): x2 = tf.placeholder(tf.int32, (), name="x") z = 3 * x + 2 * x2 lin = function([x, x2], z, givens={x2: 0}) with single_threaded_session(): initialize() assert lin(2) == 6 assert lin(2, 2) == 10
def test_function(): with tf.Graph().as_default(): x = tf.placeholder(tf.int32, (), name="x") y = tf.placeholder(tf.int32, (), name="y") z = 3 * x + 2 * y lin = function([x, y], z, givens={y: 0}) with single_threaded_session(): initialize() assert lin(2) == 6 assert lin(2, 2) == 10
def test_function(): tf.reset_default_graph() x = tf.placeholder(tf.int32, (), name="x") y = tf.placeholder(tf.int32, (), name="y") z = 3 * x + 2 * y lin = function([x, y], z, givens={y: 0}) with single_threaded_session(): initialize() assert lin(2) == 6 assert lin(x=3) == 9 assert lin(2, 2) == 10 assert lin(x=2, y=3) == 12
def test_runningmeanstd(): for (x1, x2, x3) in [ (np.random.randn(3), np.random.randn(4), np.random.randn(5)), (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), ]: rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) U.initialize() x = np.concatenate([x1, x2, x3], axis=0) ms1 = [x.mean(axis=0), x.std(axis=0)] rms.update(x1) rms.update(x2) rms.update(x3) ms2 = U.eval([rms.mean, rms.std]) assert np.allclose(ms1, ms2)
def test_multikwargs(): tf.reset_default_graph() x = tf.placeholder(tf.int32, (), name="x") with tf.variable_scope("other"): x2 = tf.placeholder(tf.int32, (), name="x") z = 3 * x + 2 * x2 lin = function([x, x2], z, givens={x2: 0}) with single_threaded_session(): initialize() assert lin(2) == 6 assert lin(2, 2) == 10 expt_caught = False try: lin(x=2) except AssertionError: expt_caught = True assert expt_caught
def test_dist(): np.random.seed(0) p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) comm = MPI.COMM_WORLD assert comm.Get_size()==2 if comm.Get_rank()==0: x1,x2,x3 = p1,p2,p3 elif comm.Get_rank()==1: x1,x2,x3 = q1,q2,q3 else: assert False rms = RunningMeanStd(epsilon=0.0, shape=(1,)) U.initialize() rms.update(x1) rms.update(x2) rms.update(x3) bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) def checkallclose(x,y): print(x,y) return np.allclose(x,y) assert checkallclose( bigvec.mean(axis=0), U.eval(rms.mean) ) assert checkallclose( bigvec.std(axis=0), U.eval(rms.std) )
def runner(env, policy_func, load_model_path, timesteps_per_batch, number_trajs, stochastic_policy, save=False, reuse=False): # Setup network # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=reuse) U.initialize() # Prepare for rollouts # ---------------------------------------- U.load_state(load_model_path) obs_list = [] acs_list = [] len_list = [] ret_list = [] for _ in tqdm(range(number_trajs)): traj = traj_1_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy) obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret'] obs_list.append(obs) acs_list.append(acs) len_list.append(ep_len) ret_list.append(ep_ret) if stochastic_policy: print('stochastic policy:') else: print('deterministic policy:') if save: filename = load_model_path.split('/')[-1] + '.' + env.spec.id np.savez(filename, obs=np.array(obs_list), acs=np.array(acs_list), lens=np.array(len_list), rets=np.array(ret_list)) avg_len = sum(len_list)/len(len_list) avg_ret = sum(ret_list)/len(ret_list) print("Average length:", avg_len) print("Average return:", avg_ret) return avg_len, avg_ret
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4, adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, log_dir=None, task_name=None, verbose=False): val_per_iter = int(max_iters/10) ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy # placeholder ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) stochastic = U.get_placeholder_cached(name="stochastic") loss = tf.reduce_mean(tf.square(ac-pi.ac)) var_list = pi.get_trainable_variables() adam = MpiAdam(var_list, epsilon=adam_epsilon) lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)]) U.initialize() adam.sync() logger.log("Pretraining with Behavior Cloning...") for iter_so_far in tqdm(range(int(max_iters))): ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') train_loss, g = lossandgrad(ob_expert, ac_expert, True) adam.update(g, optim_stepsize) if verbose and iter_so_far % val_per_iter == 0: ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') val_loss, _ = lossandgrad(ob_expert, ac_expert, True) logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss)) if ckpt_dir is None: savedir_fname = tempfile.TemporaryDirectory().name else: savedir_fname = osp.join(ckpt_dir, task_name) U.save_state(savedir_fname, var_list=pi.get_variables()) return savedir_fname
def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613 X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') wd_dict = {} h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0] sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) wd_loss = tf.get_collection("vf_losses", None) loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) self._predict = U.function([X], vpred_n) optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ async=1, kfac_update=2, cold_iter=50, \ weight_decay_dict=wd_dict, max_grad_norm=None) vf_var_list = [] for var in tf.trainable_variables(): if "vf" in var.name: vf_var_list.append(var) update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101 U.initialize() # Initialize uninitialized TF variables
def learn( env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # to store (timestep, min_reward, max_reward, avg_reward) tuples from each # iteration graph_data = [] # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # learning rate multiplier, updated according to the schedule lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) #clip_param = clip_param * lrmult # Annealed cliping parameter epislon # retrieve ob placeholder ob = U.get_placeholder_cached(name="ob") # get a 2d placeholder for a list of 1d action ac = pi.pdtype.sample_placeholder([None]) # get tensor for the KL-divergence between the old and new action gaussians kloldnew = oldpi.pd.kl(pi.pd) # get tensor for the entropy of the new action gaussian ent = pi.pd.entropy() # take the mean of all kl divergences and entropies meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # take the average to get the expected value of the current batch pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() # TODO remove experimenting - name_correspondences = \ { "fc1/kernel":"fc.0.weight", "fc1/bias":"fc.0.bias", "fc2/kernel":"fc.1.weight", "fc2/bias":"fc.1.bias", "final/kernel":"fc.2.weight", "final/bias":"fc.2.bias", } for tf_name, torch_name, out_var in [('pi/vf', 'value_net', pi.vpred),\ ('pi/pol', 'pol_net', pi.mean)]: value_dict = {} prefix = tf_name for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, prefix): name = var.name[(len(prefix) + 1):-2] if "logstd" in name: continue value = tf.get_default_session().run(var) kernel = "kernel" in name if kernel: value = value.T value_dict[name_correspondences[name]] = value with open(torch_name + '_state_dict', \ 'wb+') as file: pickle.dump(value_dict, file) print(tf.get_default_session().run(out_var,\ 
feed_dict={ob:np.array([[1.0, 2.0, 3.0, 4.0]])})) # - TODO remove experimenting adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" result = 0 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # TODO remove with open('acs', 'wb+') as file: pickle.dump(seg['ac'], file) # remove TODO # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] assign_old_eq_new() # set old parameter values to new parameter values # TODO remove *losses, g = lossandgrad(seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"], cur_lrmult) ratio_res = tf.get_default_session().run(ratio, feed_dict=\ { lossandgrad.inputs[0]: seg["ob"], lossandgrad.inputs[1]: seg["ac"], lossandgrad.inputs[2]: seg["adv"], lossandgrad.inputs[3]: seg["tdlamret"] } ) surr_res = tf.get_default_session().run(surr1, feed_dict=\ { lossandgrad.inputs[0]: seg["ob"], lossandgrad.inputs[1]: seg["ac"], lossandgrad.inputs[2]: seg["adv"], lossandgrad.inputs[3]: seg["tdlamret"] } ) pred_vals = tf.get_default_session().run(pi.vpred, feed_dict=\ { lossandgrad.inputs[0]: seg["ob"], lossandgrad.inputs[1]: seg["ac"], lossandgrad.inputs[2]: seg["adv"], lossandgrad.inputs[3]: seg["tdlamret"] }) print(losses[0], losses[2]) print(seg["tdlamret"]) print(pred_vals) import sys sys.exit() # remove TODO vpredbefore = seg["vpred"] # predicted value function before udpate #atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) #logger.log(fmt_row(13, newlosses)) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, 
name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) rewmean = np.mean(rewbuffer) rewmin = np.min(rewbuffer) rewmax = np.max(rewbuffer) timesteps_so_far += sum(lens) graph_data.append((timesteps_so_far, rewmin, rewmax, rewmean)) result = rewmean logger.record_tabular("EpRewMean", rewmean) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() # TODO remove import sys sys.exit() return pi, result, graph_data
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, initial_exploration_p=1.0, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=100, prioritized_replay=True, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, pretraining_obs=None, pretraining_targets=None, pretrain_steps=1000, pretrain_experience=None, pretrain_num_episodes=0, double_q=True, expert_qfunc=None, pretrain_lr=1e-4, sampling_starts=0, beb_agent=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) beb_agent: takes Q values and suggests actions after adding beb bonus **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) nenvs = env.num_envs print("Bayes-DeepQ:", env.num_envs) print("Total timesteps", total_timesteps) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) def make_belief_ph(name): return ObservationInput(env.belief_space, name=name) act, train, update_target, train_target, copy_target_to_q, debug = bqn.build_train( make_obs_ph=make_obs_ph, make_belief_ph=make_belief_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), pretrain_optimizer=tf.train.AdamOptimizer(learning_rate=pretrain_lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, double_q=double_q) act_params = { 'make_obs_ph': make_obs_ph, 'make_belief_ph': make_belief_ph, 'q_funcs': q_funcs, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=initial_exploration_p, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") print("Model will be saved at ", model_file) model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) print('Loaded model from {}'.format(load_path)) if pretraining_obs is not None: # pretrain target and copy to qfunc print("Pretrain steps ", pretrain_steps) for i in range(pretrain_steps): pretrain_errors = train_target(pretraining_obs, pretraining_targets) if i % 500 == 0: print("Step {}".format(i), np.mean(pretrain_errors)) if np.mean(pretrain_errors) < 1e-5: break min_rew = 0 # copy all pre-experiences if pretrain_experience is not None: for obs, action, rew, new_obs, done in zip(*pretrain_experience): replay_buffer.add(obs, action, rew, new_obs, float(done)) print("Added {} samples to ReplayBuffer".format( len(replay_buffer._storage))) min_rew = min(rew, min_rew) print("Pretrain Error", np.mean(pretrain_errors)) # Bellman for t in range(pretrain_steps): # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, belief_t, actions, rewards, obses_tp1, belief_tp1, dones, weights) print("Pretrain TD Error {}/{}".format(t, pretrain_steps), np.mean(td_errors)) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if target_network_update_freq is not None \ and t % target_network_update_freq == 0: # Update target network periodically. print("Update target") update_target() else: print("Skipping pretraining") update_target() print("Save the pretrained model", model_file) save_variables(model_file) episode_reward = np.zeros(nenvs, dtype=np.float32) saved_mean_reward = None obs = env.reset() reset = True epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_episodes = 0 episode_rewards_history = deque(maxlen=100) episode_step = np.zeros(nenvs, dtype=int) episodes = 0 #scalar start = 0 if expert_qfunc is None: aggrevate_steps = 0 with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") print("Aggrevate: Model will be saved at ", model_file) model_saved = False t = 0 # could start from pretrain-steps epoch = 0 while True: epoch += 1 if t >= total_timesteps: break if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} update_eps = exploration.value(t) update_param_noise_threshold = 0. # no randomization # update_eps = 0 print('update_eps', int(100 * exploration.value(t))) qv_error = [] obs = env.reset() for m in range(10): action, q_values = act(np.array(obs)[None], update_eps=update_eps, **kwargs) if beb_agent is not None: action = beb_agent.step(obs, action, q_values, exploration.value(t)) # if expert_qfunc is not None: # v = expert_qfunc.value(obs) # qv_error += [v - q_values[0]] env_action = action reset = False new_obs, rew, done, info = env.step(env_action) if t >= sampling_starts: # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, done) obs = new_obs episode_reward += rew episode_step += 1 for d in range(len(done)): if done[d]: # Episode done. # discount(np.array(rewards), gamma) consider doing discount epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 t += 10 * nenvs if t > learning_starts: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if target_network_update_freq is not None and t > sampling_starts \ and epoch % target_network_update_freq == 0: # Update target network periodically. print("Update target") update_target() mean_1000ep_reward = round(np.mean(episode_rewards_history), 2) num_episodes = episodes if print_freq is not None: logger.record_tabular("steps", t) logger.record_tabular("td errors", np.mean(td_errors)) logger.record_tabular("td errors std", np.std(np.abs(td_errors))) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() print("episodes", num_episodes, "steps {}/{}".format(t, total_timesteps)) if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_1000ep_reward)) print("saving model") save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), q_func=model, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), ) # Create the replay buffer replay_buffer = ReplayBuffer(50000) # Create the schedule for exploration starting from 1 (every action is random) down to # 0.02 (98% of actions are selected according to values predicted by the model). exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] obs = env.reset() for t in itertools.count(): # Take action and update exploration to the newest value action = act(obs[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) if t % 199 == 0: env.close()
def learn( env, q_func, lr=5e-4, max_timesteps=100, #DP DEL 000 buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape #need to replace in fake env def make_obs_ph(name): return U.BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() #DP replace with fake env reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action #DP action reset = False new_obs, rew, done, _ = env.step( env_action) #DP - CREATE env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # #DP EDIT # print("at step t " + str(t)) # print("printing obses_t, actions, rewards, obses_tp1, dones, weights") # print(obses_t, actions, rewards, obses_tp1, dones, weights) # print("%"*30) # import pdb; pdb.set_trace() td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() #DP DELETE OR MOD THIS SECTION mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) import pdb pdb.set_trace() model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return act
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, adaptive_kl): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(None, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) MEANNOW = train_model.pdtype.sample_placeholder([None]) LOGSTDNOW = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) NEGLOGPACNOW = tf.placeholder(tf.float32, [None]) RHO_NOW = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) KLCONST = tf.placeholder(tf.float32, []) KL_REST = tf.placeholder(tf.float32, [None]) neglogpac = train_model.pd.neglogp(A) mean = train_model.pd.mean logstd = train_model.pd.logstd # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(- 0.5 * tf.square((A - mean) / tf.exp(logstd)) - logstd + 0.5 * tf.square( (A - MEANNOW) / tf.exp(LOGSTDNOW)) + LOGSTDNOW) sgn = tf.ones_like(ratio) * tf.expand_dims(tf.sign(ADV), 1) ratio_clip = tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Defining Loss = - J is equivalent to max J r = tf.reduce_prod(sgn * tf.minimum(ratio * sgn, ratio_clip * sgn),axis=-1) pg_losses = - r * ADV / tf.stop_gradient(tf.reduce_mean(r)) # * tf.minimum(1.0,RHO_NOW) # Final PG loss # pg_loss = tf.reduce_mean(tf.stop_gradient(tf.maximum(pg_losses, pg_losses2))*(-neglogpac)) + .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC) * KL_REST) approxklold = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) approxklnow = .5 * tf.reduce_mean(tf.square(neglogpac - NEGLOGPACNOW) * tf.minimum(1.0,RHO_NOW)) kloldnew = tf.reduce_mean(tf.reduce_sum( logstd - LOGSTDNOW + 0.5 * (tf.square(tf.exp(LOGSTDNOW)) + tf.square(mean - MEANNOW)) / tf.square( tf.exp(logstd)) - 0.5, axis=1)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) pg_loss = tf.reduce_mean(pg_losses) # * tf.minimum(1.0,RHO_NOW)) # Total loss# * tf.minimum(1.0,RHO_NOW)) if adaptive_kl: pg_loss = pg_loss + KLCONST*approxkl # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') print(params) # 2. 
Build our trainer if MPI is not None: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) _grad_norm = tf.sqrt(tf.reduce_sum([tf.norm(grad) ** 2 for grad in grads])) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, klconst, rgae, trunc_rho, obs, returns, advs, masks, actions, values, neglogpacs, mean_now, logstd_now, kl_rest, rho_now, neglogpnow, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') # Normalize the advantages if rgae: r = np.minimum(trunc_rho,rho_now) radvs = r*advs advs = (advs - radvs.mean()/r.mean()) / (radvs.std() + 1e-8) else: advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values, MEANNOW:mean_now, LOGSTDNOW:logstd_now, KLCONST:klconst, KL_REST:kl_rest, RHO_NOW:rho_now, NEGLOGPACNOW:neglogpnow} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, kloldnew, approxklold, approxklnow, _grad_norm, _train], td_map )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'kloldnew', 'approxklold', 'approxklnow', 'gradnorm'] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.meanlogstd = act_model.meanlogstd self.value = act_model.value self.values = train_model.value self.meanlogstds = train_model.meanlogstd self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] #episode_minerals = [0.0] saved_mean_reward = None path_memory = np.zeros((64, 64)) obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): screen = shift(LEFT, player[0] - 32, screen) elif (player[0] < 32): screen = shift(RIGHT, 32 - player[0], screen) if (player[1] > 32): screen = shift(UP, player[1] - 32, screen) elif (player[1] < 32): screen = shift(DOWN, 32 - player[1], screen) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 path_memory_ = np.array(path_memory, copy=True) if (action == 0): #UP if (player[1] >= 16): coord = [player[0], player[1] - 16] path_memory_[player[1] - 16:player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] path_memory_[0:player[1], player[0]] = -1 #else: # rew -= 1 elif (action == 1): #DOWN if (player[1] <= 47): coord = [player[0], player[1] + 16] path_memory_[player[1]:player[1] + 16, player[0]] = -1 elif (player[1] > 47): coord = [player[0], 63] path_memory_[player[1]:63, player[0]] = -1 #else: # rew -= 1 elif (action == 2): #LEFT if (player[0] >= 16): coord = [player[0] - 16, player[1]] path_memory_[player[1], player[0] - 16:player[0]] = -1 elif (player[0] < 16): coord = [0, player[1]] path_memory_[player[1], 0:player[0]] = -1 #else: # rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 47): coord = [player[0] + 16, player[1]] path_memory_[player[1], player[0]:player[0] + 16] = -1 elif (player[0] > 47): coord = [63, player[1]] path_memory_[player[1], player[0]:63] = -1 #else: # rew -= 1 #else: #Cannot move, give minus reward # rew -= 1 #if(path_memory[coord[1],coord[0]] != 0): # rew -= 0.5 path_memory = np.array(path_memory_) #print("action : %s Coord : %s" % (action, coord)) if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = player_relative + path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): new_screen = shift(LEFT, player[0] - 32, new_screen) elif (player[0] < 32): new_screen = shift(RIGHT, 32 - player[0], new_screen) if (player[1] > 32): new_screen = shift(UP, player[1] - 32, new_screen) elif (player[1] < 32): new_screen = shift(DOWN, 32 - player[1], new_screen) rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew #episode_minerals[-1] += obs[0].reward if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = player_relative + path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 32): screen = shift(LEFT, player[0] - 32, screen) elif (player[0] < 32): screen = shift(RIGHT, 32 - player[0], screen) if (player[1] > 32): screen = shift(UP, player[1] - 32, screen) elif (player[1] < 32): screen = shift(DOWN, 32 - player[1], screen) # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) path_memory = np.zeros((64, 64)) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) #mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) #logger.record_tabular("mean 100 episode mineral", mean_100ep_mineral) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): self.sess = sess = get_session() with tf.compat.v1.variable_scope('ppo2_model', reuse=tf.compat.v1.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # # Train model for training # if microbatch_size is None: # train_model = policy(nbatch_train, nsteps, sess) # else: # train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS # self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.compat.v1.placeholder(tf.float32, [None]) self.R = R = tf.compat.v1.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.compat.v1.placeholder( tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None]) self.LR = LR = tf.compat.v1.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.compat.v1.placeholder(tf.float32, []) # neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. # entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value # vpred = train_model.vf # vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value # vf_losses1 = tf.square(vpred - R) # Clipped value # vf_losses2 = tf.square(vpredclipped - R) # vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) # ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J # pg_losses = -ADV * ratio # # pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # # # Final PG loss # pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) # approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) # clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # # # Total loss # loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.compat.v1.trainable_variables('ppo2_model') # # 2. Build our trainer # if MPI is not None: # self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5, use_locking=True) # else: # self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5, use_locking=True) # # 3. 
Calculate the gradients # grads_and_var = self.trainer.compute_gradients(loss, params) # grads, var = zip(*grads_and_var) # # if max_grad_norm is not None: # # Clip the gradients (normalize) # grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) # grads_and_var = list(zip(grads, var)) # # zip aggregate each gradient with parameters associated # # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # # self.grads = grads # self.var = var # with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): # self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] # self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] # self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(nbatch_train, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
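# The Model above builds pg_loss, vf_loss, approxkl and clipfrac symbolically in TF.
# A NumPy sketch of the same clipped-surrogate arithmetic, useful as a reference for what
# train() returns; inputs are per-sample arrays and the function name is just a stand-in,
# not part of the baselines API.
import numpy as np

def ppo_losses(advs, neglogpac, old_neglogpac, vpred, old_vpred, returns, cliprange):
    ratio = np.exp(old_neglogpac - neglogpac)                  # pi_new(a|s) / pi_old(a|s)
    pg_loss = np.mean(np.maximum(-advs * ratio,
                                 -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum((vpred - returns) ** 2,
                                       (vpred_clipped - returns) ** 2))
    approxkl = 0.5 * np.mean((neglogpac - old_neglogpac) ** 2)
    clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
    return pg_loss, vf_loss, approxkl, clipfrac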
def learn( *, policy_network, classifier_network, env, max_iters, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, expert_trajs_path='./expert_trajs', num_expert_trajs=500, data_subsample_freq=20, g_step=1, d_step=1, classifier_entcoeff=1e-3, num_particles=5, d_stepsize=3e-4, max_episodes=0, total_timesteps=0, # time constraint callback=None, load_path=None, save_path=None, render=False, use_classifier_logsumexp=True, use_reward_logsumexp=False, use_svgd=True, **policy_network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) entcoeff coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() if nworkers > 1: raise NotImplementedError rank = MPI.COMM_WORLD.Get_rank() gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.49) U.get_session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) policy = build_policy(env, policy_network, value_network='copy', **policy_network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) D = build_classifier(env, classifier_network, num_particles, classifier_entcoeff, use_classifier_logsumexp, use_reward_logsumexp) grads_list, vars_list = D.get_grads_and_vars() if use_svgd: optimizer = SVGD( grads_list, vars_list, lambda: tf.train.AdamOptimizer(learning_rate=d_stepsize)) else: optimizer = Ensemble( grads_list, vars_list, lambda: tf.train.AdamOptimizer(learning_rate=d_stepsize)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='yellow')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='blue')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, 
op=MPI.SUM) out /= nworkers return out U.initialize() if rank == 0: saver = tf.train.Saver(var_list=get_variables("pi"), max_to_keep=10000) writer = FileWriter(os.path.join(save_path, 'logs')) stats = Statistics( scalar_keys=["average_return", "average_episode_length"]) if load_path is not None: # pi.load(load_path) saver.restore(U.get_session(), load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- if load_path is not None: seg_gen = traj_segment_generator(pi, env, 1, stochastic=False, render=render) else: seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, render=render) seg_gen_e = expert_traj_segment_generator(env, expert_trajs_path, data_subsample_freq, timesteps_per_batch, num_expert_trajs) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # nothing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) if iters_so_far % 500 == 0 and save_path is not None and load_path is None: fname = os.path.join(save_path, 'checkpoints', 'checkpoint') save_state(fname, saver, iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if load_path is not None: iters_so_far += 1 logger.record_tabular("EpRew", int(np.mean(seg["ep_true_rets"]))) logger.record_tabular("EpLen", int(np.mean(seg["ep_lens"]))) logger.dump_tabular() continue seg["rew"] = D.get_reward(seg["ob"], seg["ac"]) add_vtarg_and_adv(seg, gamma, lam) ob, ac, ep_lens, atarg, tdlamret = seg["ob"], seg["ac"], seg[ "ep_lens"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=1000): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) with timed("sample expert trajectories"): ob_a, ac_a, ep_lens_a = ob, ac, ep_lens seg_e = seg_gen_e.__next__() ob_e, ac_e, ep_lens_e = seg_e["ob"], seg_e["ac"], seg_e["ep_lens"] if hasattr(D, "rms"): obs = np.concatenate([ob_a, ob_e], axis=0) if isinstance(ac_space, spaces.Box): acs = np.concatenate([ac_a, ac_e], axis=0) D.rms.update(np.concatenate([obs, acs], axis=1)) elif isinstance(ac_space, spaces.Discrete): D.rms.update(obs) else: raise NotImplementedError with timed("SVGD"): sess = tf.get_default_session() feed_dict = { D.Xs['a']: ob_a, D.As['a']: ac_a, D.Ls['a']: ep_lens_a, D.Xs['e']: ob_e, D.As['e']: ac_e, D.Ls['e']: ep_lens_e } for _ in range(d_step): sess.run(optimizer.update_op, feed_dict=feed_dict) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_true_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() stats.add_all_summary( writer, [np.mean(rewbuffer), np.mean(lenbuffer)], iters_so_far) rewbuffer.clear() lenbuffer.clear() return pi
def learn( env, make_policy, *, n_episodes, horizon, delta, gamma, max_iters, sampler=None, use_natural_gradient=False, #can be 'exact', 'approximate' fisher_reg=1e-2, iw_method='is', iw_norm='none', bound='J', line_search_type='parabola', save_weights=0, improvement_tol=0., center_return=False, render_after=None, max_offline_iters=100, callback=None, clipping=False, entropy='none', positive_return=False, reward_clustering='none', capacity=10, warm_start=True): np.set_printoptions(precision=3) max_samples = horizon * n_episodes if line_search_type == 'binary': line_search = line_search_binary elif line_search_type == 'parabola': line_search = line_search_parabola else: raise ValueError() # Building the environment ob_space = env.observation_space ac_space = env.action_space # Creating the memory buffer memory = Memory(capacity=capacity, batch_size=n_episodes, horizon=horizon, ob_space=ob_space, ac_space=ac_space) # Building the target policy and saving its parameters pi = make_policy('pi', ob_space, ac_space) all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.split('/')[1].startswith('pol') ] shapes = [U.intprod(var.get_shape().as_list()) for var in var_list] n_parameters = sum(shapes) # Building a set of behavioral policies behavioral_policies = memory.build_policies(make_policy, pi) # Placeholders ob_ = ob = U.get_placeholder_cached(name='ob') ac_ = pi.pdtype.sample_placeholder([None], name='ac') mask_ = tf.placeholder(dtype=tf.float32, shape=(None), name='mask') rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='rew') disc_rew_ = tf.placeholder(dtype=tf.float32, shape=(None), name='disc_rew') clustered_rew_ = tf.placeholder(dtype=tf.float32, shape=(None)) gradient_ = tf.placeholder(dtype=tf.float32, shape=(n_parameters, 1), name='gradient') iter_number_ = tf.placeholder(dtype=tf.int32, name='iter_number') active_policies = tf.placeholder(dtype=tf.float32, shape=(capacity), name='active_policies') losses_with_name = [] # Total number of trajectories N_total = tf.reduce_sum(active_policies) * n_episodes # Split operations disc_rew_split = tf.reshape(disc_rew_ * mask_, [-1, horizon]) rew_split = tf.reshape(rew_ * mask_, [-1, horizon]) mask_split = tf.reshape(mask_, [-1, horizon]) # Policy densities target_log_pdf = pi.pd.logp(ac_) * mask_ target_log_pdf_split = tf.reshape(target_log_pdf, [-1, horizon]) behavioral_log_pdfs = tf.stack([ bpi.pd.logp(ac_) * mask_ for bpi in memory.policies ]) # Shape is (capacity, ntraj*horizon) behavioral_log_pdfs_split = tf.reshape(behavioral_log_pdfs, [memory.capacity, -1, horizon]) # Compute renyi divergencies and sum over time, then exponentiate emp_d2_split = tf.reshape( tf.stack([pi.pd.renyi(bpi.pd, 2) * mask_ for bpi in memory.policies]), [memory.capacity, -1, horizon]) emp_d2_split_cum = tf.exp(tf.reduce_sum(emp_d2_split, axis=2)) # Compute arithmetic and harmonic mean of emp_d2 emp_d2_mean = tf.reduce_mean(emp_d2_split_cum, axis=1) emp_d2_arithmetic = tf.reduce_sum( emp_d2_mean * active_policies) / tf.reduce_sum(active_policies) emp_d2_harmonic = tf.reduce_sum(active_policies) / tf.reduce_sum( 1 / emp_d2_mean) # Return processing: clipping, centering, discounting ep_return = clustered_rew_ #tf.reduce_sum(mask_split * disc_rew_split, axis=1) if clipping: rew_split = tf.clip_by_value(rew_split, -1, 1) if center_return: ep_return = ep_return - tf.reduce_mean(ep_return) rew_split = rew_split - (tf.reduce_sum(rew_split) / (tf.reduce_sum(mask_split) + 1e-24)) discounter = [pow(gamma, i) for i in range(0, 
horizon)] # Decreasing gamma discounter_tf = tf.constant(discounter) disc_rew_split = rew_split * discounter_tf # Reward statistics return_mean = tf.reduce_mean(ep_return) return_std = U.reduce_std(ep_return) return_max = tf.reduce_max(ep_return) return_min = tf.reduce_min(ep_return) return_abs_max = tf.reduce_max(tf.abs(ep_return)) return_step_max = tf.reduce_max(tf.abs(rew_split)) # Max step reward return_step_mean = tf.abs(tf.reduce_mean(rew_split)) positive_step_return_max = tf.maximum(0.0, tf.reduce_max(rew_split)) negative_step_return_max = tf.maximum(0.0, tf.reduce_max(-rew_split)) return_step_maxmin = tf.abs(positive_step_return_max - negative_step_return_max) losses_with_name.extend([(return_mean, 'InitialReturnMean'), (return_max, 'InitialReturnMax'), (return_min, 'InitialReturnMin'), (return_std, 'InitialReturnStd'), (emp_d2_arithmetic, 'EmpiricalD2Arithmetic'), (emp_d2_harmonic, 'EmpiricalD2Harmonic'), (return_step_max, 'ReturnStepMax'), (return_step_maxmin, 'ReturnStepMaxmin')]) if iw_method == 'is': # Sum the log prob over time. Shapes: target(Nep, H), behav (Cap, Nep, H) target_log_pdf_episode = tf.reduce_sum(target_log_pdf_split, axis=1) behavioral_log_pdf_episode = tf.reduce_sum(behavioral_log_pdfs_split, axis=2) # To avoid numerical instability, compute the inversed ratio log_ratio = target_log_pdf_split - behavioral_log_pdfs_split inverse_log_ratio_episode = -tf.reduce_sum(log_ratio, axis=2) iw = 1 / tf.reduce_sum(tf.exp(inverse_log_ratio_episode) * tf.expand_dims(active_policies, -1), axis=0) # Compute also the balance-heuristic weights iw_split = tf.reshape(iw, (memory.capacity, -1)) iw_by_behavioral = tf.reduce_mean(iw_split, axis=1) losses_with_name.append( (iw_by_behavioral[0] / tf.reduce_sum(iw_by_behavioral), 'MultiIWFirstRatio')) losses_with_name.append( (tf.reduce_max(iw_by_behavioral), 'MultiIWMax')) losses_with_name.append( (tf.reduce_sum(iw_by_behavioral), 'MultiIWSum')) losses_with_name.append( (tf.reduce_min(iw_by_behavioral), 'MultiIWMin')) # Get the probability by exponentiation #target_pdf_episode = tf.exp(target_log_pdf_episode) #behavioral_pdf_episode = tf.exp(behavioral_log_pdf_episode) # Get the denominator by averaging over behavioral policies #behavioral_pdf_mixture = tf.reduce_mean(behavioral_pdf_episode, axis=0) + 1e-24 #iw = target_pdf_episode / behavioral_pdf_mixture iwn = iw / n_episodes # Compute the J w_return_mean = tf.reduce_sum(ep_return * iwn) # Empirical D2 of the mixture and relative ESS ess_renyi_arithmetic = N_total / emp_d2_arithmetic ess_renyi_harmonic = N_total / emp_d2_harmonic # Log quantities losses_with_name.extend([ (tf.reduce_max(iw), 'MaxIW'), (tf.reduce_min(iw), 'MinIW'), (tf.reduce_mean(iw), 'MeanIW'), (U.reduce_std(iw), 'StdIW'), (tf.reduce_min(target_log_pdf_episode), 'MinTargetPdf'), (tf.reduce_min(behavioral_log_pdf_episode), 'MinBehavPdf'), (ess_renyi_arithmetic, 'ESSRenyiArithmetic'), (ess_renyi_harmonic, 'ESSRenyiHarmonic') ]) else: raise NotImplementedError() if bound == 'J': bound_ = w_return_mean elif bound == 'max-d2-harmonic': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi_harmonic)) * return_abs_max elif bound == 'max-d2-arithmetic': bound_ = w_return_mean - tf.sqrt( (1 - delta) / (delta * ess_renyi_arithmetic)) * return_abs_max else: raise NotImplementedError() # Policy entropy for exploration ent = pi.pd.entropy() meanent = tf.reduce_mean(ent) losses_with_name.append((meanent, 'MeanEntropy')) # Add policy entropy bonus if entropy != 'none': scheme, v1, v2 = entropy.split(':') if scheme 
== 'step': entcoeff = tf.cond(iter_number_ < int(v2), lambda: float(v1), lambda: float(0.0)) losses_with_name.append((entcoeff, 'EntropyCoefficient')) entbonus = entcoeff * meanent bound_ = bound_ + entbonus elif scheme == 'lin': ip = tf.cast(iter_number_ / max_iters, tf.float32) entcoeff_decay = tf.maximum( 0.0, float(v2) + (float(v1) - float(v2)) * (1.0 - ip)) losses_with_name.append((entcoeff_decay, 'EntropyCoefficient')) entbonus = entcoeff_decay * meanent bound_ = bound_ + entbonus elif scheme == 'exp': ent_f = tf.exp( -tf.abs(tf.reduce_mean(iw) - 1) * float(v2)) * float(v1) losses_with_name.append((ent_f, 'EntropyCoefficient')) bound_ = bound_ + ent_f * meanent else: raise Exception('Unrecognized entropy scheme.') losses_with_name.append((w_return_mean, 'ReturnMeanIW')) losses_with_name.append((bound_, 'Bound')) losses, loss_names = map(list, zip(*losses_with_name)) ''' if use_natural_gradient: p = tf.placeholder(dtype=tf.float32, shape=[None]) target_logpdf_episode = tf.reduce_sum(target_log_pdf_split * mask_split, axis=1) grad_logprob = U.flatgrad(tf.stop_gradient(iwn) * target_logpdf_episode, var_list) dot_product = tf.reduce_sum(grad_logprob * p) hess_logprob = U.flatgrad(dot_product, var_list) compute_linear_operator = U.function([p, ob_, ac_, disc_rew_, mask_], [-hess_logprob]) ''' assert_ops = tf.group(*tf.get_collection('asserts')) print_ops = tf.group(*tf.get_collection('prints')) compute_lossandgrad = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], losses + [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_grad = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], [U.flatgrad(bound_, var_list), assert_ops, print_ops]) compute_bound = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], [bound_, assert_ops, print_ops]) compute_losses = U.function([ ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies ], losses) #compute_temp = U.function([ob_, ac_, rew_, disc_rew_, clustered_rew_, mask_, iter_number_, active_policies], [log_inverse_ratio, abc, iw]) set_parameter = U.SetFromFlat(var_list) get_parameter = U.GetFlat(var_list) policy_reinit = tf.variables_initializer(var_list) if sampler is None: seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=gamma) sampler = type("SequentialSampler", (object, ), { "collect": lambda self, _: seg_gen.__next__() })() U.initialize() # Starting optimizing episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=n_episodes) rewbuffer = deque(maxlen=n_episodes) while True: iters_so_far += 1 if iters_so_far == 100: print('=== CHANGED GAMMA TO 1.0') seg_gen = traj_segment_generator(pi, env, n_episodes, horizon, stochastic=True, gamma=1.0) sampler = type("SequentialSampler", (object, ), { "collect": lambda self, _: seg_gen.__next__() })() if render_after is not None and iters_so_far % render_after == 0: if hasattr(env, 'render'): render(env, pi, horizon) if callback: callback(locals(), globals()) if iters_so_far >= max_iters: print('Finished...') break logger.log('********** Iteration %i ************' % iters_so_far) theta = get_parameter() with timed('sampling'): seg = sampler.collect(theta) lens, rets = seg['ep_lens'], seg['ep_rets'] lenbuffer.extend(lens) rewbuffer.extend(rets) episodes_so_far += len(lens) timesteps_so_far += sum(lens) # Adding batch of trajectories to memory 
memory.add_trajectory_batch(seg) # Get multiple batches from memory seg_with_memory = memory.get_trajectories() # Get clustered reward reward_matrix = np.reshape( seg_with_memory['disc_rew'] * seg_with_memory['mask'], (-1, horizon)) ep_reward = np.sum(reward_matrix, axis=1) ep_reward = cluster_rewards(ep_reward, reward_clustering) args = ob, ac, rew, disc_rew, clustered_rew, mask, iter_number, active_policies = ( seg_with_memory['ob'], seg_with_memory['ac'], seg_with_memory['rew'], seg_with_memory['disc_rew'], ep_reward, seg_with_memory['mask'], iters_so_far, memory.get_active_policies_mask()) def evaluate_loss(): loss = compute_bound(*args) return loss[0] def evaluate_gradient(): gradient = compute_grad(*args) return gradient[0] if use_natural_gradient: def evaluate_fisher_vector_prod(x): return compute_linear_operator(x, *args)[0] + fisher_reg * x def evaluate_natural_gradient(g): return cg(evaluate_fisher_vector_prod, g, cg_iters=10, verbose=0) else: evaluate_natural_gradient = None with timed('summaries before'): logger.record_tabular("Iteration", iters_so_far) logger.record_tabular("InitialBound", evaluate_loss()) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if save_weights > 0 and iters_so_far % save_weights == 0: logger.record_tabular('Weights', str(get_parameter())) import pickle file = open('checkpoint' + str(iters_so_far) + '.pkl', 'wb') pickle.dump(theta, file) if not warm_start or memory.get_current_load() == capacity: # Optimize with timed("offline optimization"): theta, improvement = optimize_offline( theta, set_parameter, line_search, evaluate_loss, evaluate_gradient, evaluate_natural_gradient, max_offline_ite=max_offline_iters) set_parameter(theta) print(theta) with timed('summaries after'): meanlosses = np.array(compute_losses(*args)) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) else: # Reinitialize the policy tf.get_default_session().run(policy_reinit) logger.dump_tabular() env.close()
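# The importance-weighting block above (iw_method == 'is') forms per-episode balance
# weights and, for the 'max-d2-harmonic' bound, penalizes the weighted return by a
# Renyi-based effective sample size. A NumPy sketch of those expressions, assuming the
# log-probabilities are already summed over time and all memory slots are active; the
# names here are illustrative, not the actual POIS API.
import numpy as np

def pois_harmonic_bound(target_logp_ep, behav_logp_ep, ep_return, d2_per_policy, delta, n_episodes):
    """target_logp_ep: (N,), behav_logp_ep: (C, N), d2_per_policy: (C,) exponentiated Renyi-2 terms."""
    iw = 1.0 / np.sum(np.exp(behav_logp_ep - target_logp_ep[None, :]), axis=0)
    w_return_mean = np.sum(ep_return * iw / n_episodes)
    n_total = len(d2_per_policy) * n_episodes
    d2_harmonic = len(d2_per_policy) / np.sum(1.0 / d2_per_policy)
    ess = n_total / d2_harmonic
    penalty = np.sqrt((1.0 - delta) / (delta * ess)) * np.max(np.abs(ep_return))
    return w_return_mean - penalty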
def learn(env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- #seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: #data_path = '/Users/wjh720/Desktop/Tmp/para_%i/' % (timesteps_per_actorbatch / 100) data_path = '/home/icenter/tmp/openai_data/para_%i/' % (iters_so_far // 100) U.load_state(data_path + 'para') test(pi, env, timesteps_per_actorbatch, stochastic=True)
def main(): start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") env = StarCraft2Env(map_name="8m", reward_only_positive=False, reward_scale_rate=200, state_last_action=True, obs_last_action=True, obs_timestep_number=True, state_timestep_number=True) #reward_defeat=-200 env_info = env.get_env_info() n_episodes = 2500 #4000 #2000 timesteps = 500000 n_agents = env_info["n_agents"] n_actions = env_info["n_actions"] output_len = n_actions lr = 0.002 buffer_size = 70000 #int(timesteps * 0.1) # 80000 # reduce this a bit; ideally about 1/10 of the training steps. 70000 test 200 80000 20000 batch_size = 32 # 32 gamma = 0.99 num_agents = 8 local_obs_len = 179 # local obs:80 ; global state:168; global_state_len = 348 # 179+169 hidden_vector_len = 256 # 128 # 1 256 tau = 0.001 num_exploring = buffer_size # buffer_size action_low = -1 action_high = 1 save_freq = 10000 critic_output_len = 1 logdir = "tensorboard/%s/%s_lr%s/%s" % ("BicNet", timesteps, lr, start_time) Logger.DEFAULT \ = Logger.CURRENT \ = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)]) sess = U.make_session() sess.__enter__() actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, local_obs_len, output_len, hidden_vector_len) critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents, global_state_len, critic_output_len, hidden_vector_len, n_actions) sess.run(tf.global_variables_initializer()) replay_buffer = ReplayBuffer(buffer_size) action_noise = OU_noise(decay_period=timesteps - buffer_size) action_noise.reset() # model_file_load = os.path.join(str(350000) + "_" + "model_segment_training2/", "defeat_zerglings") # U.load_state(model_file_load, sess) U.initialize() t = 0 step_train = 0 for e in range(n_episodes): env.reset() terminated = False episode_reward = 0 local_obs = env.get_obs() local_obs = np.array(local_obs) global_state = env.get_state() global_state_expand = np.zeros( [local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]]) reward_hl_own_old = [] reward_hl_en_old = [] episode_reward_agent = [0 for n in range(n_agents)] for i in range(local_obs.shape[0]): global_state_expand[i] = np.append(local_obs[i], global_state.flatten()) reward_hl_own_old.append(env.get_agent_health(i)) reward_hl_en_old.append(env.get_enemy_health(i)) while not terminated: t = t + 1 critic_input = np.expand_dims(global_state_expand, axis=0) actor_input = np.expand_dims(local_obs, axis=0) action = actor.predict(actor_input)[0] act_with_noise = action #np.clip(action + action_noise.get_noise(step_train), action_low, action_high) act_mat_norm = (act_with_noise + 1) / 2 actions = [] dead_unit = [] rew_expand = np.zeros((n_agents, 1)) for agent_id in range(n_agents): sum_avail_act = 0 act_prob = [] avail_actions = env.get_avail_agent_actions(agent_id) avail_actions_ind = np.nonzero(avail_actions)[0] act_unit_norm = act_mat_norm[agent_id] for i in avail_actions_ind: act_prob.append(act_unit_norm[i]) sum_avail_act = sum_avail_act + act_unit_norm[i] if (sum_avail_act == 0): act_prob = (np.array(act_prob) + 1) / len(act_prob) else: act_prob = np.array(act_prob) / sum_avail_act index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel()) actions.append(index) if (len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0): dead_unit.append(agent_id) reward_base, terminated, info = env.step(actions) new_local_obs = env.get_obs() new_local_obs = np.array(new_local_obs) new_global_state = env.get_state() new_global_state_expand = np.zeros([ new_local_obs.shape[0], new_local_obs.shape[1] + new_global_state.shape[0] ]) reward_hl_own_new
= [] reward_hl_en_new = [] for i in range(new_local_obs.shape[0]): new_global_state_expand[i] = np.append( new_local_obs[i], new_global_state.flatten()) reward_hl_own_new.append(env.get_agent_health(i)) reward_hl_en_new.append(env.get_enemy_health(i)) for i in range(n_agents): if (i in dead_unit): rew_expand[i] = 0 else: rew_expand[i] = -0.05 if (actions[i] > 5): target_id = actions[i] - 6 health_reduce_en = reward_hl_en_old[ target_id] - reward_hl_en_new[target_id] if (health_reduce_en > 0): rew_expand[i] += 2 + health_reduce_en * 5 # if (reward_base > 50): # rew_expand[i] += 20 else: rew_expand[i] += 1 else: rew_expand[i] += (reward_hl_own_new[i] - reward_hl_own_old[i]) * 5 # if (terminated): if (info["battle_won"] is False): rew_expand[i] += -10 else: rew_expand[i] += 10 episode_reward_agent[i] += rew_expand[i] replay_buffer.add(local_obs, global_state_expand, act_with_noise, rew_expand, terminated, new_local_obs, new_global_state_expand) episode_reward += reward_base local_obs = new_local_obs global_state_expand = new_global_state_expand if (t == num_exploring): print("training starts") if (t >= num_exploring): local_s_batch, global_s_batch, a_batch, r_batch, done_batch, local_s2_batch, global_s2_batch = replay_buffer.sample_batch( batch_size ) # [group0:[batch_size, trace.dimension], group1, ... group8] target_q = r_batch + gamma * critic.predict_target( global_s2_batch, actor.predict_target(local_s2_batch)) predicted_q_value, _ = critic.train( global_s_batch, a_batch, np.reshape(target_q, (batch_size, num_agents, critic_output_len))) a_outs = actor.predict(local_s_batch) # a_outs and a_batch are exactly the same here grads = critic.action_gradients(global_s_batch, a_outs) # gradient of Q with respect to the actions a actor.train(local_s_batch, grads) step_train = step_train + 1 actor.update_target_network() critic.update_target_network() if (t % save_freq == 0): model_file_save = os.path.join( "model/" + str(step_train) + "_" + "training_steps_model/", "8m") U.save_state(model_file_save) print("Model has been trained %s times" % (step_train)) # replay_buffer.save() print("steps until now : %s, episode: %s, episode reward: %s" % (t, e, episode_reward)) logger.record_tabular("steps", t) logger.record_tabular("episodes", e) logger.record_tabular("reward_episode", episode_reward) for i in range(n_agents): logger.record_tabular("reward_agent_" + str(i), episode_reward_agent[i]) logger.dump_tabular() # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings") # U.save_state(model_file_save) env.close()
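# The update inside the exploration branch above follows the usual deterministic
# actor-critic pattern: a TD target r + gamma * Q_target(s', mu_target(s')), a critic
# regressed onto it, an actor stepped along dQ/da, and target networks blended with rate
# tau. A minimal NumPy sketch of the target and the soft update; note the loop above does
# not mask the bootstrap with the terminal flag, so the (1 - done) factor here is an
# addition for completeness, and Polyak averaging is assumed for update_target_network.
import numpy as np

def td_target(rewards, dones, q_target_next, gamma=0.99):
    """rewards, dones, q_target_next: arrays of shape (batch, n_agents, 1)."""
    return rewards + gamma * (1.0 - dones) * q_target_next

def soft_update(online_params, target_params, tau=0.001):
    """theta_target <- tau * theta + (1 - tau) * theta_target."""
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online_params, target_params)]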
def enjoy(env, policy_func, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_name=None, save_per_acts=3, reload_name=None ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() if reload_name: saver = tf.train.Saver() saver.restore(tf.get_default_session(), reload_name) print("Loaded model successfully.") # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = 
seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses)
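# The epoch loop above relies on Dataset.iterate_once to shuffle the collected batch and
# yield minibatches for Adam. A minimal stand-in showing that shuffle-and-slice pattern;
# the real class lives in baselines.common.dataset and this helper is only illustrative.
import numpy as np

def iterate_minibatches(data, batch_size, shuffle=True, rng=np.random):
    """data: dict of equally sized arrays, e.g. dict(ob=..., ac=..., atarg=..., vtarg=...)."""
    n = len(next(iter(data.values())))
    idx = np.arange(n)
    if shuffle:
        rng.shuffle(idx)
    for start in range(0, n, batch_size):
        mb = idx[start:start + batch_size]
        yield {k: v[mb] for k, v in data.items()}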
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) 
get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def __init__(self, env, q_func, config=DEEPQ_CONFIG, callback=None): self.env = env self.q_func = q_func self.config = config self.callback = callback # Create all the functions necessary to train the model gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=config["gpu_memory_fraction"]) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) act, self.train, self.update_target, self.debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]), gamma=config["gamma"], grad_norm_clipping=10, param_noise=config["param_noise"]) act_params = { # 'make_obs_ph': make_obs_ph, # 'q_func': q_func, 'num_actions': env.action_space.n, } self.act = ActWrapper(act, act_params) # Create the replay buffer self.config = config self.replay_buffer = None self.beta_schedule = None self.make_replay_buffer() # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule( schedule_timesteps=int(config["exploration_fraction"] * config["max_timesteps"]), initial_p=1.0, final_p=config["exploration_final_eps"]) # Initialize the parameters and copy them to the target network. U.initialize() self.update_target() self.t = 0 self.episode_rewards = [0.0] self.num_episodes = 1 self.saved_mean_reward = None self.saved_episode_num = None self.episode_frames = 0 self.model_file = None self.start_time = 0 self.episode_start_time = 0
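# The LinearSchedule above anneals epsilon from initial_p to final_p over
# exploration_fraction * max_timesteps steps and then holds it constant. A one-function
# sketch of that value() computation, shown for reference.
def linear_schedule_value(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# e.g. linear_schedule_value(0, 10000) == 1.0 and linear_schedule_value(10000, 10000) == 0.02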
def learn( env, test_env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation # CMAES max_fitness, # has to be negative, as cmaes consider minization popsize, gensize, bounds, sigma, eval_iters, max_v_train_iter, max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) seed, env_id): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy backup_pi = policy_fn( "backup_pi", ob_space, ac_space ) # Construct a network for every individual to adapt during the es evolution reward = tf.placeholder(dtype=tf.float32, shape=[None]) # step rewards pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) old_pi_params = tf.placeholder(dtype=tf.float32, shape=[None]) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule bound_coeff = tf.placeholder( name='bound_coeff', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") next_ob = U.get_placeholder_cached( name="next_ob") # next step observation for updating q function ac = U.get_placeholder_cached( name="act") # action placeholder for computing q function mean_ac = U.get_placeholder_cached( name="mean_act") # action placeholder for computing q function kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent pi_adv = pi.qpred - pi.vpred adv_mean, adv_var = tf.nn.moments(pi_adv, axes=[0]) normalized_pi_adv = (pi_adv - adv_mean) / tf.sqrt(adv_var) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * normalized_pi_adv # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * normalized_pi_adv # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) qf_loss = tf.reduce_mean( tf.square(reward + gamma * pi.mean_qpred - pi.qpred)) # qf_loss = tf.reduce_mean(U.huber_loss(pi.qpred - tf.stop_gradient(reward + gamma * pi.mean_qpred))) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) qf_losses = [qf_loss] vf_losses = [vf_loss] # pol_loss = -tf.reduce_mean(pi_adv) pol_loss = pol_surr + pol_entpen losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() vf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("vf") ] pol_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("pol") ] qf_var_list = [ v for v in var_list if v.name.split("/")[1].startswith("qf") ] mean_qf_var_list = [ v for v in var_list if 
v.name.split("/")[1].startswith("meanqf") ] vf_lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(vf_loss, vf_var_list)]) qf_lossandgrad = U.function( [ob, ac, next_ob, mean_ac, lrmult, reward, atarg], qf_losses + [U.flatgrad(qf_loss, qf_var_list)]) qf_adam = MpiAdam(qf_var_list, epsilon=adam_epsilon) vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon) assign_target_q_eq_eval_q = U.function( [], [], updates=[ tf.assign(target_q, eval_q) for (target_q, eval_q) in zipsame(mean_qf_var_list, qf_var_list) ]) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) assign_backup_eq_new = U.function( [], [], updates=[ tf.assign(backup_v, newv) for ( backup_v, newv) in zipsame(backup_pi.get_variables(), pi.get_variables()) ]) assign_new_eq_backup = U.function( [], [], updates=[ tf.assign(newv, backup_v) for (newv, backup_v ) in zipsame(pi.get_variables(), backup_pi.get_variables()) ]) # Compute all losses mean_pi_actions = U.function([ob], [pi.pd.mode()]) compute_pol_losses = U.function([ob, ac, atarg, ret, lrmult], [pol_loss, pol_surr, pol_entpen, meankl]) U.initialize() get_pi_flat_params = U.GetFlat(pol_var_list) set_pi_flat_params = U.SetFromFlat(pol_var_list) vf_adam.sync() global timesteps_so_far, episodes_so_far, iters_so_far, \ tstart, lenbuffer, rewbuffer, tstart, ppo_timesteps_so_far, best_fitness episodes_so_far = 0 timesteps_so_far = 0 ppo_timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards eval_gen = traj_segment_generator_eval(pi, test_env, timesteps_per_actorbatch, stochastic=True) # For evaluation seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) # For train V Func # Build generator for all solutions actors = [] best_fitness = 0 for i in range(popsize): newActor = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True, eval_gen=eval_gen) actors.append(newActor) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if max_timesteps and timesteps_so_far >= max_timesteps: print("Max time steps") break elif max_episodes and episodes_so_far >= max_episodes: print("Max episodes") break elif max_iters and iters_so_far >= max_iters: print("Max iterations") break elif max_seconds and time.time() - tstart >= max_seconds: print("Max time") break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) # Generate new samples # Train V func for i in range(max_v_train_iter): logger.log("Iteration:" + str(iters_so_far) + " - sub-train iter for V func:" + str(i)) logger.log("Generate New Samples") seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) ob, ac, next_ob, atarg, reward, tdlamret, traj_idx = seg["ob"], seg["ac"], seg["next_ob"], seg["adv"], seg[ "rew"], seg["tdlamret"], \ seg["traj_index"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update( ob) # update running mean/std for normalization 
assign_old_eq_new( ) # set old parameter values to new parameter values # Train V function logger.log("Training V Func and Evaluating V Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *vf_losses, g = vf_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) vf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(vf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) d_q = Dataset(dict(ob=ob, ac=ac, next_ob=next_ob, reward=reward, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) # Re-train q function logger.log("Training Q Func Evaluating Q Func Losses") for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d_q.iterate_once(optim_batchsize): *qf_losses, g = qf_lossandgrad( batch["ob"], batch["ac"], batch["next_ob"], mean_pi_actions(batch["ob"])[0], cur_lrmult, batch["reward"], batch["atarg"]) qf_adam.update(g, optim_stepsize * cur_lrmult) losses.append(qf_losses) logger.log(fmt_row(13, np.mean(losses, axis=0))) assign_target_q_eq_eval_q() # CMAES Train Policy assign_old_eq_new() # set old parameter values to new parameter values assign_backup_eq_new() # backup current policy flatten_weights = get_pi_flat_params() opt = cma.CMAOptions() opt['tolfun'] = max_fitness opt['popsize'] = popsize opt['maxiter'] = gensize opt['verb_disp'] = 0 opt['verb_log'] = 0 opt['seed'] = seed opt['AdaptSigma'] = True es = cma.CMAEvolutionStrategy(flatten_weights, sigma, opt) while True: if es.countiter >= gensize: logger.log("Max generations for current layer") break logger.log("Iteration:" + str(iters_so_far) + " - sub-train Generation for Policy:" + str(es.countiter)) logger.log("Sigma=" + str(es.sigma)) solutions = es.ask() costs = [] lens = [] assign_backup_eq_new() # backup current policy for id, solution in enumerate(solutions): set_pi_flat_params(solution) losses = [] cost = compute_pol_losses(ob, ac, atarg, tdlamret, cur_lrmult) costs.append(cost[0]) assign_new_eq_backup() # Weights decay l2_decay = compute_weight_decay(0.99, solutions) costs += l2_decay costs, real_costs = fitness_rank(costs) es.tell_real_seg(solutions=solutions, function_values=costs, real_f=real_costs, segs=None) best_solution = es.result[0] best_fitness = es.result[1] logger.log("Best Solution Fitness:" + str(best_fitness)) set_pi_flat_params(best_solution) iters_so_far += 1 episodes_so_far += sum(lens)
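# The CMA-ES block above calls two helpers defined elsewhere, compute_weight_decay
# and fitness_rank. A plausible, self-contained sketch of both is given here,
# assuming the usual ES conventions: an L2 penalty on the flat parameter vectors
# and a centered rank transform of the costs (lower cost, lower rank, since CMA-ES
# minimizes). These are illustrative stand-ins, not the exact helpers used above.
import numpy as np

def compute_weight_decay_sketch(weight_decay, solutions):
    # L2 penalty per candidate solution (each row is one flat parameter vector).
    solutions = np.asarray(solutions)
    return weight_decay * np.mean(solutions * solutions, axis=1)

def fitness_rank_sketch(costs):
    # Map raw costs to centered ranks in [-0.5, 0.5]; returns (ranked, original).
    costs = np.asarray(costs, dtype=np.float64)
    ranks = np.empty_like(costs)
    ranks[np.argsort(costs)] = np.arange(len(costs))
    ranked = ranks / (len(costs) - 1) - 0.5
    return ranked, costs

if __name__ == "__main__":
    sols = np.random.randn(8, 5)      # 8 candidate flat parameter vectors
    costs = np.random.randn(8)        # their raw (negative-fitness) costs
    costs = costs + compute_weight_decay_sketch(0.99, sols)
    ranked, real = fitness_rank_sketch(costs)
    print(ranked, real)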
def learn( env, policy_fn, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_per_iter=50, max_sample_traj=10, ckpt_dir=None, log_dir=None, task_name="origin", sample_stochastic=True, load_model_path=None, task="train"): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed clipping parameter epsilon ob_p = U.get_placeholder_cached(name="ob_physics") ob_f = U.get_placeholder_cached(name="ob_frames") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() if pi.recurrent: state_c = U.get_placeholder_cached(name="state_c") state_h = U.get_placeholder_cached(name="state_h") lossandgrad = U.function( [ob_p, ob_f, state_c, state_h, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) compute_losses = U.function( [ob_p, ob_f, state_c, state_h, ac, atarg, ret, lrmult], losses) else: lossandgrad = U.function([ob_p, ob_f, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) compute_losses = U.function([ob_p, ob_f, ac, atarg, ret, lrmult], losses) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) writer = U.FileWriter(log_dir) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) traj_gen = traj_episode_generator(pi, env, timesteps_per_actorbatch, stochastic=sample_stochastic) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" loss_stats = 
stats(loss_names) ep_stats = stats(["Reward", "Episode_Length", "Episode_This_Iter"]) if task == "sample_trajectory": sample_trajectory(load_model_path, max_sample_traj, traj_gen, task_name, sample_stochastic) sys.exit() while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError # Save model if iters_so_far % save_per_iter == 0 and ckpt_dir is not None: U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far) logger.log2("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] ob_p, ob_f = zip(*ob) ob_p = np.array(ob_p) ob_f = np.array(ob_f) vpredbefore = seg["vpred"] # predicted value function before update atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if pi.recurrent: sc, sh = zip(*seg["state"]) d = Dataset(dict(ob_p=ob_p, ob_f=ob_f, ac=ac, atarg=atarg, vtarg=tdlamret, state_c=np.array(sc), state_h=np.array(sh)), shuffle=not pi.recurrent) else: d = Dataset(dict(ob_p=ob_p, ob_f=ob_f, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob_p) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log2("Optimizing...") logger.log2(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): if pi.recurrent: lg = lossandgrad(batch["ob_p"], batch["ob_f"], batch["state_c"], batch["state_h"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) else: lg = lossandgrad(batch["ob_p"], batch["ob_f"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses = lg[:-1] g = lg[-1] adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log2(fmt_row(13, np.mean(losses, axis=0))) logger.log2("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): if pi.recurrent: newlosses = compute_losses(batch["ob_p"], batch["ob_f"], batch["state_c"], batch["state_h"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) else: newlosses = compute_losses(batch["ob_p"], batch["ob_f"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log2(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) 
logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() loss_stats.add_all_summary(writer, meanlosses, iters_so_far) ep_stats.add_all_summary( writer, [np.mean(rewbuffer), np.mean(lenbuffer), len(lens)], iters_so_far) return pi
def initialize(): U.initialize()
def learn( env, curEnv, policy_fn, reward_giver, expert_dataset, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- d_stepsize = 3e-4 ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) d_adam = MpiAdam(reward_giver.get_trainable_variables()) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() d_adam.sync() # Prepare for rollouts # ---------------------------------------- viewer = mujoco_py.MjViewer(env.sim) curViewer = mujoco_py.MjViewer(curEnv.sim) seg_gen = traj_segment_generator(pi, env, curEnv, viewer, curViewer, reward_giver, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: 
raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(ob) batch_size = len(ob) d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (ob, ac), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(ob_batch) # update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(g, d_stepsize) d_losses.append(newlosses) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() return pi
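# The reward_giver above is a GAIL-style discriminator whose loss and gradient are
# built elsewhere. A compact numpy sketch of the quantities it is assumed to
# produce: a binary cross-entropy loss that pushes D(s, a) toward 0 on generator
# pairs and 1 on expert pairs, and a surrogate reward -log(1 - D(s, a)) handed to
# the policy. Shapes and names here are illustrative assumptions, not the exact API.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def discriminator_quantities_sketch(logits_gen, logits_exp):
    d_gen, d_exp = sigmoid(logits_gen), sigmoid(logits_exp)
    eps = 1e-8
    # Cross-entropy: generator samples labeled 0, expert samples labeled 1.
    loss = -np.mean(np.log(1.0 - d_gen + eps)) - np.mean(np.log(d_exp + eps))
    # Reward for the policy: high when the discriminator mistakes it for the expert.
    reward = -np.log(1.0 - d_gen + eps)
    return loss, reward

if __name__ == "__main__":
    loss, rew = discriminator_quantities_sketch(np.array([-1.0, 0.5]), np.array([2.0, 1.0]))
    print(loss, rew)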
def trpo_learn( env, policy, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() # policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): # pi = policy(observ_placeholder=ob) pi = policy with tf.variable_scope("oldpi"): oldpi = create_policy(env, network='mlp', value_network='copy') atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") print("length of old_pi_var_list:", len(get_oldpi_trainable_variables("oldpi"))) print("length of pi_var_list:", len(var_list)) print("length of vf_var_list:", len(vf_var_list)) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() 
print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) < 2, \ 'out of max_iters, max_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() return pi
def compete_learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=20, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-3, schedule='linear' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- # at this stage, the ob_space and reward_space is #TODO: all of this tuples are not right ? becuase items in tuple is not mutable #TODO: another way to store the two agents' states is to use with with tf.variable_scope(scope, reuse=reuse): len1 = 2 ob_space = env.observation_space.spaces ac_space = env.action_space.spaces pi = [ policy_func("pi" + str(i), ob_space[i], ac_space[i], placeholder_name="observation" + str(i)) for i in range(len1) ] oldpi = [ policy_func("oldpi" + str(i), ob_space[i], ac_space[i], placeholder_name="observation" + str(i)) for i in range(len1) ] atarg = [ tf.placeholder(dtype=tf.float32, shape=[None]) for i in range(len1) ] ret = [tf.placeholder(dtype=tf.float32, shape=[None]) for i in range(len1)] tdlamret = [[] for i in range(len1)] # TODO: here I should revise lrmult to as it was before # lrmult = 1.0 # here for simple I only use constant learning rate multiplier lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult #TODO: this point I cannot finally understand it, originally it is # ob=U.get_placeholder_cached(name="ob") #TODO: here it is a bug to fix, I think the get_placeholder_cached is global, you can only cache observation once and the next time if it finds the name placeholder it will return the previous placeholder, I don't know whether different namescope have effect on this. 
# ob1 = U.get_placeholder_cached(name="observation1") # Note: I am not sure about this point # # ob2 = U.get_placeholder_cached(name="observation2") # ob1 = U.get_placeholder_cached(name="observation0") # Note: I am not sure about this point # ob2 = U.get_placeholder_cached(name="observation1") #TODO: the only one question now is that pi network and oldpi networ both have the ob_ph named "observation", even in the original baseline implementation, does pi and oldpi share the observation placeholder, I think it is not ob = [ U.get_placeholder_cached(name="observation" + str(i)) for i in range(len1) ] # ac = tuple([pi[i].act(stochastic=True, observation=env.observation_space[i])[0] # for i in range(len1)]) # TODO: here for the policy to work I changed the observation parameter passed into the pi function to s which comes from env.reset() # s = env.reset() # ac = tuple([pi[i].act(stochastic=True, observation=s[i])[0] # for i in range(len1)]) ac = [pi[i].pdtype.sample_placeholder([None]) for i in range(len1)] kloldnew = [oldpi[i].pd.kl(pi[i].pd) for i in range(len1)] ent = [pi[i].pd.entropy() for i in range(len1)] print("ent1 and ent2 are {} and {}".format(ent[0], ent[1])) meankl = [U.mean(kloldnew[i]) for i in range(len1)] meanent = [U.mean(ent[i]) for i in range(len1)] pol_entpen = [(-entcoeff) * meanent[i] for i in range(len1)] ratio = [ tf.exp(pi[i].pd.logp(ac[i]) - oldpi[i].pd.logp(ac[i])) for i in range(len1) ] # ratio = [tf.exp(pi[i].pd.logp(ac) - oldpi[i].pd.logp(ac[i])) for i in range(len1)] #pnew / pold surr1 = [ratio[i] * atarg[i] for i in range(len1)] # U.clip = tf.clip_by_value(t, clip_value_min, clip_value_max,name=None): # # among which t is A 'Tensor' so surr2 = [ U.clip(ratio[i], 1.0 - clip_param, 1.0 + clip_param) for i in range(len1) ] pol_surr = [-U.mean(tf.minimum(surr1[i], surr2[i])) for i in range(len1)] vf_loss = [U.mean(tf.square(pi[i].vpred - ret[i])) for i in range(len1)] total_loss = [ pol_surr[i] + pol_entpen[i] + vf_loss[i] for i in range(len1) ] # here I ccome to realize that the following miscelleous losses are just operations not tensors so they should be # # be made to a list to contain the info of the two agents # surr2 = U.clip(ratio[i], 1.0 - clip_param, 1.0 + clip_param) # pol_surr = -U.mean(tf.minimum(surr1[i], surr2[i])) # vf_loss = U.mean(tf.square(pi[i].vpred - ret[i])) # total_loss = pol_surr + pol_entpen + vf_loss #TODO: in another way I choose to revise losses to following: losses = [[pol_surr[i], pol_entpen[i], vf_loss[i], meankl[i], meanent[i]] for i in range(len1)] loss_names = ["pol_sur", "pol_entpen", "vf_loss", "kl", "ent"] var_list = [pi[i].get_trainable_variables() for i in range(len1)] lossandgrad = [ U.function([ob[i], ac[i], atarg[i], ret[i], lrmult], losses[i] + [U.flatgrad(total_loss[i], var_list[i])]) for i in range(len1) ] adam = [MpiAdam(var_list[i], epsilon=adam_epsilon) for i in range(2)] #TODO: I wonder this cannot function as expected because the result is a list of functions, not will not execute automatically # assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv) # for (oldv, newv) in zipsame(oldpi[i].get_variables(), pi[i].get_variables())]) for i in range(len1)] # compute_losses is a function, so it should not be copied to copies, nevertheless the parameters should be # passed into it as the two agents compute_losses = [ U.function([ob[i], ac[i], atarg[i], ret[i], lrmult], losses[i]) for i in range(len1) ] # sess = U.get_session() # writer = tf.summary.FileWriter(logdir='log-mlp',graph=sess.graph) # now when 
the training iteration ends, save the trained model and test the win rate of the two. pi0_variables = slim.get_variables(scope="pi0") pi1_variables = slim.get_variables(scope="pi1") parameters_to_save_list0 = [v for v in pi0_variables] parameters_to_save_list1 = [v for v in pi1_variables] parameters_to_save_list = parameters_to_save_list0 + parameters_to_save_list1 saver = tf.train.Saver(parameters_to_save_list) restore = tf.train.Saver(parameters_to_save_list) U.initialize() restore.restore(U.get_session(), "parameter/500/500.pkl") U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(pi[1].get_variables(), pi[0].get_variables()) ])() U.get_session().run # [adam[i].sync() for i in range(2)] adam[0].sync() adam[1].sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, horizon=timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = [deque(maxlen=100) for i in range(len1)] # rolling buffer for episode lengths rewbuffer = [deque(maxlen=100) for i in range(len1)] # rolling buffer for episode rewards parameters_savers = [] assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError # saver.restore() logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() #TODO: I got to fix this function to let it return the right seg["adv"] and seg["lamret"] add_vtarg_and_adv(seg, gamma, lam) losses = [[] for i in range(len1)] meanlosses = [[] for i in range(len1)] for i in range(len1): ob[i], ac[i], atarg[i], tdlamret[i] = seg["ob"][i], seg["ac"][ i], seg["adv"][i], seg["tdlamret"][i] # ob_extend = np.expand_dims(ob[i],axis=0) # ob[i] = ob_extend vpredbefore = seg["vpred"][ i] # predicted value function before udpate atarg[i] = (atarg[i] - atarg[i].mean()) / atarg[i].std( ) # standardized advantage function estimate d = Dataset(dict(ob=ob[i], ac=ac[i], atarg=atarg[i], vtarg=tdlamret[i]), shuffle=not pi[i].recurrent) optim_batchsize = optim_batchsize or ob[i].shape[0] if hasattr(pi[i], "ob_rms"): pi[i].ob_rms.update( ob[i]) # update running mean/std for policy #TODO: I have to make suer how assign_old_ea_new works and whether to assign it for each agent #Yes I can assure it will work now # save network parameters using tf.train.Saver # saver_name = "saver" + str(iters_so_far) U.function([], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( oldpi[i].get_variables(), pi[i].get_variables()) ])() # set old parameter values to new parameter values # Here we do a bunch of optimization epochs over the data logger.log("Optimizing the agent{}...".format(i)) logger.log(fmt_row(13, loss_names)) for _ in range(optim_epochs): losses[i] = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): # batch["ob"] = np.expand_dims(batch["ob"], axis=0) *newlosses, g = lossandgrad[i](batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) 
adam[i].update(g, optim_stepsize * cur_lrmult) losses[i].append(newlosses) logger.log(fmt_row(13, np.mean(losses[i], axis=0))) logger.log("Evaluating losses of agent{}...".format(i)) losses[i] = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses[i](batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses[i].append(newlosses) meanlosses[i], _, _ = mpi_moments(losses[i], axis=0) logger.log(fmt_row(13, meanlosses[i])) for (lossval, name) in zipsame(meanlosses[i], loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before{}".format(i), explained_variance(vpredbefore, tdlamret[i])) lrlocal = (seg["ep_lens"][i], seg["ep_rets"][i]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer[i].extend(lens) rewbuffer[i].extend(rews) logger.record_tabular("EpLenMean {}".format(i), np.mean(lenbuffer[i])) logger.record_tabular("EpRewMean {}".format(i), np.mean(rewbuffer[i])) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 temp_pi = policy_func("temp_pi" + str(iters_so_far), ob_space[0], ac_space[0], placeholder_name="temp_pi_observation" + str(iters_so_far)) U.function([], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( temp_pi.get_variables(), pi[0].get_variables()) ])() parameters_savers.append(temp_pi) # now I think when the if iters_so_far % 3 == 0: sample_iteration = int( np.random.uniform(iters_so_far / 2, iters_so_far)) print("now assign the {}th parameter of agent0 to agent1".format( sample_iteration)) pi_restore = parameters_savers[sample_iteration] U.function([], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(pi[1].get_variables(), pi_restore.get_variables()) ])() logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() # # now when the training iteration ends, save the trained model and test the win rate of the two. # pi0_variables = slim.get_variables(scope = "pi0") # pi1_variables = slim.get_variables(scope = "pi1") # parameters_to_save_list0 = [v for v in pi0_variables] # parameters_to_save_list1 = [v for v in pi1_variables] # parameters_to_save_list = parameters_to_save_list0 + parameters_to_save_list1 # saver = tf.train.Saver(parameters_to_save_list) # parameters_path = 'parameter/' # tf.train.Saver() save_path = saver.save(U.get_session(), "parameter/800/800.pkl")
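# The self-play loop above snapshots agent 0 every iteration and, every third
# iteration, loads a past snapshot into agent 1, sampled uniformly from the second
# half of the history. A tiny sketch of that sampling rule, detached from
# TensorFlow; the names here are illustrative only.
import numpy as np

def sample_opponent_index_sketch(iters_so_far, rng=np.random):
    # Uniform over [iters_so_far / 2, iters_so_far), matching the rule above.
    return int(rng.uniform(iters_so_far / 2, iters_so_far))

if __name__ == "__main__":
    history = ["snapshot_%d" % i for i in range(12)]   # stand-in for parameters_savers
    if len(history) % 3 == 0:
        idx = sample_opponent_index_sketch(len(history))
        print("assign opponent from", history[idx])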
def main(): start_time = datetime.datetime.now().strftime("%Y%m%d%H%M") env = StarCraft2Env(map_name="8m", reward_only_positive=False, reward_scale_rate=200, state_last_action=True, obs_last_action=True, obs_timestep_number=True, state_timestep_number=True) env_info = env.get_env_info() n_episodes = 100 #4000 #2000 timesteps = 700000 n_agents = env_info["n_agents"] n_actions = env_info["n_actions"] output_len = n_actions lr = 0.002 buffer_size = int(timesteps * 0.1 / 70) # 80000 # reduce this a bit; aim for roughly 1/10 of the training steps. 70000 test 200 80000 20000 batch_size = 32 # 32 gamma = 0.99 num_agents = 8 local_obs_len = 179 # local obs:80 ; global state:168; global_state_len = 348 # 179+169 hidden_vector_len = 256 # 128 # 1 256 tau = 0.001 num_exploring = buffer_size # buffer_size action_low = -1 action_high = 1 # save_freq = 1000 #10000 critic_output_len = 1 # logdir = "tensorboard/%s/%s_lr%s/%s" % ( # "BicNet", # timesteps, # lr, # start_time # ) # # Logger.DEFAULT \ # = Logger.CURRENT \ # = Logger(dir=None, # output_formats=[TensorBoardOutputFormat(logdir)]) sess = U.make_session() sess.__enter__() actor = ActorNetwork(sess, lr, tau, batch_size, num_agents, local_obs_len, output_len, hidden_vector_len) critic = CriticNetwork(sess, lr, tau, actor.get_num_trainable_vars(), num_agents, global_state_len, critic_output_len, hidden_vector_len, n_actions) sess.run(tf.global_variables_initializer()) replay_buffer = ReplayBuffer(buffer_size) # replay_buffer = ReplayBuffer(buffer_size) # action_noise = OU_noise(decay_period=timesteps - buffer_size) # action_noise.reset() U.initialize() model_load_steps = 190001 model_file_load = os.path.join("model/"+str(model_load_steps) + "_" + "training_steps_model/", "8m") U.load_state(model_file_load, sess) print("model trained for %s steps has been loaded" % (model_load_steps)) t = 0 # step_train = 0 for e in range(n_episodes): env.reset() terminated = False episode_reward = 0 local_obs = env.get_obs() local_obs = np.array(local_obs) global_state = env.get_state() global_state_expand = np.zeros([local_obs.shape[0], local_obs.shape[1] + global_state.shape[0]]) reward_hl_own_old = [] reward_hl_en_old = [] episode_reward_agent = [0 for n in range(n_agents)] for i in range(local_obs.shape[0]): global_state_expand[i] = np.append(local_obs[i], global_state.flatten()) reward_hl_own_old.append(env.get_agent_health(i)) reward_hl_en_old.append(env.get_enemy_health(i)) while not terminated: t = t+1 critic_input = np.expand_dims(global_state_expand, axis=0) actor_input = np.expand_dims(local_obs, axis=0) action = actor.predict(actor_input)[0] act_with_no_noise = np.clip(action, action_low, action_high) act_mat_norm = (act_with_no_noise+1)/2 actions = [] dead_unit = [] rew_expand = np.zeros((n_agents, 1)) for agent_id in range(n_agents): sum_avail_act = 0 act_prob = [] avail_actions = env.get_avail_agent_actions(agent_id) avail_actions_ind = np.nonzero(avail_actions)[0] act_unit_norm = act_mat_norm[agent_id] for i in avail_actions_ind: act_prob.append(act_unit_norm[i]) sum_avail_act = sum_avail_act + act_unit_norm[i] if(sum_avail_act == 0): act_prob = (np.array(act_prob) + 1)/len(act_prob) else: act_prob = np.array(act_prob)/sum_avail_act # index = np.random.choice(np.array(avail_actions_ind), p=act_prob.ravel()) index = avail_actions_ind[np.argmax(act_prob)] actions.append(index) if(len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0): dead_unit.append(agent_id) reward_base, terminated, _ = env.step(actions) new_local_obs = env.get_obs() new_local_obs = np.array(new_local_obs) 
new_global_state= env.get_state() new_global_state_expand = np.zeros([new_local_obs.shape[0], new_local_obs.shape[1] + new_global_state.shape[0]]) reward_hl_own_new = [] reward_hl_en_new = [] for i in range(new_local_obs.shape[0]): new_global_state_expand[i] = np.append(new_local_obs[i], new_global_state.flatten()) reward_hl_own_new.append(env.get_agent_health(i)) reward_hl_en_new.append(env.get_enemy_health(i)) for i in range(n_agents): if (i in dead_unit): rew_expand[i] = 0 elif (actions[i] > 5): target_id = actions[i] - 6 health_reduce_en = reward_hl_en_old[target_id] - reward_hl_en_new[target_id] if (health_reduce_en > 0): if (reward_base > 0): rew_expand[i] = 2 + reward_base else: rew_expand[i] = 2 else: rew_expand[i] = 1 else: rew_expand[i] = (reward_hl_own_new[i] - reward_hl_own_old[i]) * 5 episode_reward_agent[i] += rew_expand[i] # replay_buffer.add(local_obs, global_state_expand, act_with_no_noise, rew_expand, terminated, new_local_obs, new_global_state_expand) episode_reward += reward_base local_obs = new_local_obs global_state_expand = new_global_state_expand # if (t >= num_exploring): # local_s_batch, global_s_batch, a_batch, r_batch, done_batch, local_s2_batch, global_s2_batch = replay_buffer.sample_batch( # batch_size) # [group0:[batch_size, trace.dimension], group1, ... group8] # target_q = r_batch + gamma * critic.predict_target(global_s2_batch, # actor.predict_target(local_s2_batch)) # a_q = target_q print("Total reward in episode {} = {}".format(e, episode_reward)) # logger.record_tabular("steps", t) # logger.record_tabular("episodes", e) # logger.record_tabular("reward_episode", episode_reward) # for i in range(n_agents): # logger.record_tabular("reward_agent_"+str(i), episode_reward_agent[i]) # # logger.dump_tabular() # model_file_save = os.path.join(str(t) + "_" + "model_segment_training/", "defeat_zerglings") # U.save_state(model_file_save) print('game over') print(env.get_stats()) env.close()
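# The action-selection block in the loop above maps each agent's continuous actor
# output in [-1, 1] to a legal SC2 action id by masking unavailable actions and
# renormalizing the remaining scores. A compact numpy sketch of that rule (the
# greedy variant used above); the function name is illustrative only.
import numpy as np

def select_masked_action_sketch(actor_out_row, avail_actions):
    # actor_out_row: per-action scores in [-1, 1]; avail_actions: 0/1 availability mask.
    scores = (np.asarray(actor_out_row) + 1.0) / 2.0       # shift to [0, 1]
    avail_idx = np.nonzero(avail_actions)[0]
    probs = scores[avail_idx]
    total = probs.sum()
    if total == 0:
        probs = np.ones_like(probs) / len(probs)           # fall back to uniform over legal actions
    else:
        probs = probs / total
    return avail_idx[np.argmax(probs)]                     # greedy pick among legal actions

if __name__ == "__main__":
    print(select_masked_action_sketch([-0.2, 0.9, 0.1, -1.0], [1, 0, 1, 1]))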
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldlogit = np.concatenate([path["logits"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldlogit) if kl > desired_kl * 2: logger.log("kl too high") U.eval(tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5))) elif kl < desired_kl / 2: logger.log("kl too low") U.eval(tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5))) else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) eprewmean = np.mean([path["reward"].sum() for path in paths]) if callback is not None: callback(locals(), globals()) logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
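# The ACKTR loop above adapts the K-FAC stepsize from the measured policy KL
# against desired_kl. A framework-free sketch of that controller, using the same
# thresholds and clipping bounds as the code above:
def adjust_stepsize_sketch(stepsize, kl, desired_kl=0.002,
                           min_stepsize=1e-8, max_stepsize=1.0):
    if kl > desired_kl * 2:
        return max(min_stepsize, stepsize / 1.5)   # KL too high -> smaller steps
    if kl < desired_kl / 2:
        return min(max_stepsize, stepsize * 1.5)   # KL too low -> larger steps
    return stepsize                                # KL in range -> keep stepsize

if __name__ == "__main__":
    s = 0.03
    for kl in (0.01, 0.0005, 0.002):
        s = adjust_stepsize_sketch(s, kl)
        print(kl, s)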
def _build_model( self, input_space, action_size, policy_func, clip_param=0.2, entcoeff=0.01, # clipping parameter epsilon, entropy coeff adam_epsilon=1e-5): sess = U.get_session() if sess is None: sess = U.make_session(8) sess.__enter__() # Setup losses and stuff # ---------------------------------------- with tf.variable_scope(self.scope): self.pi = policy_func( "pi", input_space, action_size) # Construct network for new policy self.oldpi = policy_func("oldpi", input_space, action_size) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = self.pi.pdtype.sample_placeholder([None]) kloldnew = self.oldpi.pd.kl(self.pi.pd) ent = self.pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(self.pi.pd.logp(ac) - self.oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(self.pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss var_list = self.pi.get_trainable_variables() # more debug info debug_atarg = atarg pi_ac = self.pi.pd.logp(ac) opi_ac = self.oldpi.pd.logp(ac) vpred = U.mean(self.pi.vpred) pi_pd = U.mean(self.pi.pd.flatparam()) opi_pd = self.oldpi.pd.flatparam()[0] kl_oldnew = kloldnew[0] grads = tf.gradients(total_loss, var_list) losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] debugs = [ debug_atarg, pi_ac, opi_ac, vpred, pi_pd, opi_pd, kl_oldnew, total_loss ] self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + debugs + [var_list, grads] + [U.flatgrad(total_loss, var_list)]) self.adam = MpiAdam(var_list, epsilon=adam_epsilon) self.assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( self.oldpi.get_variables(), self.pi.get_variables()) ]) self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() self.adam.sync()
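# The losses built above center on PPO's clipped surrogate. A numpy sketch of that
# objective for a batch of log-probabilities and advantages (same formula as the
# TensorFlow graph above, negated for minimization), handy for sanity-checking the
# sign conventions outside the session:
import numpy as np

def ppo_clipped_surrogate_sketch(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)                  # pi_new(a|s) / pi_old(a|s)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))            # pessimistic surrogate (L^CLIP)

if __name__ == "__main__":
    logp_new = np.array([-1.0, -0.5, -2.0])
    logp_old = np.array([-1.2, -0.4, -2.0])
    adv = np.array([1.0, -0.5, 0.3])
    print(ppo_clipped_surrogate_sketch(logp_new, logp_old, adv))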
def learn( env, policy_fn, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) resume_training=False, ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( name="atarg", dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(name="ret", dtype=tf.float32, shape=[None]) # Empirical return summ_writer = tf.summary.FileWriter("/tmp/tensorboard", U.get_session().graph) U.launch_tensorboard_in_background("/tmp/tensorboard") lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule ob = U.get_placeholder_cached(name="ob") ob_2 = U.get_placeholder_cached(name="ob_2") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -tf.reduce_mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] check_ops = tf.add_check_numerics_ops() var_list = pi.get_trainable_variables() adam = MpiAdam(var_list, epsilon=adam_epsilon) lossandgrad = U.function( [ob, ob_2, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)], updates=[check_ops], ) debugnan = U.function([ob, ob_2, ac, atarg, ret, lrmult], losses + [ratio, surr1, surr2]) dbgnames = loss_names + ["ratio", "surr1", "surr2"] assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ob_2, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" if resume_training: pi.load_variables("/tmp/rlnav_model") oldpi.load_variables("/tmp/rlnav_model") else: # clear reward history log with open(rew_hist_filepath, 'w') as f: f.write('') while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= 
max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ob_2, ac, atarg, tdlamret = seg["ob"], seg["ob_2"], seg["ac"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate # flatten first two dimensions and put into batch maker dataset_dict = dict(ob=ob, ob_2=ob_2, ac=ac, atarg=atarg, vtarg=tdlamret) def flatten_horizon_and_agent_dims(array): """ using F order because we assume that the shape is (horizon, n_agents) and we want the new flattened first dimension to first run through horizon, then n_agents in order to not cut up the sequentiality of the experiences """ new_shape = (array.shape[0] * array.shape[1], ) + array.shape[2:] return array.reshape(new_shape, order='F') for key in dataset_dict: dataset_dict[key] = flatten_horizon_and_agent_dims( dataset_dict[key]) d = Dataset(dataset_dict, shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] n_agents = ob.shape[1] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy #export rewards to log file rewplot = np.array(seg["rew"]) with open(rew_hist_filepath, 'ab') as f: np.savetxt(f, rewplot, delimiter=',') assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): landg = lossandgrad(batch["ob"], batch["ob_2"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) newlosses, g = landg[:-1], landg[-1] # debug nans if np.any(np.isnan(newlosses)): dbglosses = debugnan(batch["ob"], batch["ob_2"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) raise ValueError("Nan detected in losses: {} {}".format( dbgnames, dbglosses)) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) pi.save_variables("/tmp/rlnav_model") pi.load_variables("/tmp/rlnav_model") logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ob_2"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular( "ev_tdlam_before", np.average([ explained_variance(vpredbefore[:, i], tdlamret[:, i]) for i in range(n_agents) ])) # average of explained variance for each agent # lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values # listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples # lens, rews = map(flatten_lists, zip(*listoflrpairs)) lens, rews = (seg["ep_lens"], seg["ep_rets"]) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", 
np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) tf.summary.scalar("EpLenMean", np.mean(lenbuffer)) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() return pi
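# --- Hedged sketch (standalone check, not part of the training code above) ---
# The F-order reshape in flatten_horizon_and_agent_dims is easy to get wrong, so this
# small NumPy check verifies on a toy (horizon, n_agents, ob_dim) array that each
# agent's trajectory stays contiguous and temporally ordered after flattening.
# All names below are illustrative.
import numpy as np

def flatten_horizon_and_agent_dims(array):
    # Collapse (horizon, n_agents, ...) into (horizon * n_agents, ...); with order='F'
    # the horizon index varies fastest, so agent 0's steps come first, then agent 1's, ...
    new_shape = (array.shape[0] * array.shape[1],) + array.shape[2:]
    return array.reshape(new_shape, order='F')

horizon, n_agents, ob_dim = 4, 3, 2
ob = np.arange(horizon * n_agents * ob_dim).reshape(horizon, n_agents, ob_dim)
flat = flatten_horizon_and_agent_dims(ob)
for agent in range(n_agents):
    # Row agent*horizon + t of the flattened array is observation t of that agent.
    assert np.array_equal(flat[agent * horizon:(agent + 1) * horizon], ob[:, agent, :])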
def learn(env, q_func, num_actions=4, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, num_cpu=16, param_noise=False, param_noise_threshold=0.05, callback=None): """Train a deepq model. Parameters ------- env: pysc2.env.SC2Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. 
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((32, 32), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, scope="deepq") # # act_y, train_y, update_target_y, debug_y = deepq.build_train( # make_obs_ph=make_obs_ph, # q_func=q_func, # num_actions=num_actions, # optimizer=tf.train.AdamOptimizer(learning_rate=lr), # gamma=gamma, # grad_norm_clipping=10, # scope="deepq_y" # ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer( buffer_size, alpha=prioritized_replay_alpha) # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule( prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters, # initial_p=prioritized_replay_beta0, # final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) # replay_buffer_y = ReplayBuffer(buffer_size) beta_schedule = None # beta_schedule_y = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule( schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() # update_target_y() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first obs = env.step( actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])]) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype(int) #+ path_memory player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 16): screen = shift(LEFT, player[0] - 16, screen) elif (player[0] < 16): screen = shift(RIGHT, 16 - player[0], screen) if (player[1] > 16): screen = shift(UP, player[1] - 16, screen) elif (player[1] < 16): screen = shift(DOWN, 16 - player[1], screen) reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join("model/", "mineral_shards") print(model_file) for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
- exploration.value(t) + exploration.value(t) / float(num_actions)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act( np.array(screen)[None], update_eps=update_eps, **kwargs)[0] # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0] reset = False coord = [player[0], player[1]] rew = 0 if (action == 0): #UP if (player[1] >= 8): coord = [player[0], player[1] - 8] #path_memory_[player[1] - 16 : player[1], player[0]] = -1 elif (player[1] > 0): coord = [player[0], 0] #path_memory_[0 : player[1], player[0]] = -1 #else: # rew -= 1 elif (action == 1): #DOWN if (player[1] <= 23): coord = [player[0], player[1] + 8] #path_memory_[player[1] : player[1] + 16, player[0]] = -1 elif (player[1] > 23): coord = [player[0], 31] #path_memory_[player[1] : 63, player[0]] = -1 #else: # rew -= 1 elif (action == 2): #LEFT if (player[0] >= 8): coord = [player[0] - 8, player[1]] #path_memory_[player[1], player[0] - 16 : player[0]] = -1 elif (player[0] < 8): coord = [0, player[1]] #path_memory_[player[1], 0 : player[0]] = -1 #else: # rew -= 1 elif (action == 3): #RIGHT if (player[0] <= 23): coord = [player[0] + 8, player[1]] #path_memory_[player[1], player[0] : player[0] + 16] = -1 elif (player[0] > 23): coord = [31, player[1]] #path_memory_[player[1], player[0] : 63] = -1 if _MOVE_SCREEN not in obs[0].observation["available_actions"]: obs = env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) new_action = [ sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord]) ] # else: # new_action = [sc2_actions.FunctionCall(_NO_OP, [])] obs = env.step(actions=new_action) player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] new_screen = (player_relative == _PLAYER_NEUTRAL).astype( int) #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] if (player[0] > 16): new_screen = shift(LEFT, player[0] - 16, new_screen) elif (player[0] < 16): new_screen = shift(RIGHT, 16 - player[0], new_screen) if (player[1] > 16): new_screen = shift(UP, player[1] - 16, new_screen) elif (player[1] < 16): new_screen = shift(DOWN, 16 - player[1], new_screen) rew = obs[0].reward done = obs[0].step_type == environment.StepType.LAST # Store transition in the replay buffer. replay_buffer.add(screen, action, rew, new_screen, float(done)) # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done)) screen = new_screen episode_rewards[-1] += rew reward = episode_rewards[-1] if done: obs = env.reset() player_relative = obs[0].observation["screen"][ _PLAYER_RELATIVE] screen = (player_relative == _PLAYER_NEUTRAL).astype( int) #+ path_memory player_y, player_x = ( player_relative == _PLAYER_FRIENDLY).nonzero() player = [int(player_x.mean()), int(player_y.mean())] # Select all marines first env.step(actions=[ sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL]) ]) episode_rewards.append(0.0) #episode_minerals.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size) # weights_y, batch_idxes_y = np.ones_like(rewards_y), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps # new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) # replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() # update_target_y() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("reward", reward) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}". format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act)
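# --- Hedged sketch (illustrative, not the original helper) ---
# Compact restatement of the action -> screen-coordinate branching in the loop above:
# 0=UP, 1=DOWN, 2=LEFT, 3=RIGHT, each moving 8 cells on the 32x32 screen. Clamping to
# the screen edge is equivalent to the if/elif cascade used in the training loop.
_SCREEN = 32
_STEP = 8

def action_to_coord(action, player):
    x, y = player
    if action == 0:    # UP
        y = max(y - _STEP, 0)
    elif action == 1:  # DOWN
        y = min(y + _STEP, _SCREEN - 1)
    elif action == 2:  # LEFT
        x = max(x - _STEP, 0)
    elif action == 3:  # RIGHT
        x = min(x + _STEP, _SCREEN - 1)
    return [x, y]

assert action_to_coord(0, [16, 3]) == [16, 0]    # UP clamps at the top edge
assert action_to_coord(1, [16, 10]) == [16, 18]  # DOWN moves 8 cells
assert action_to_coord(3, [27, 16]) == [31, 16]  # RIGHT clamps at the right edge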
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, gamma=0.99, sil_update=1, fn_reward=None, fn_obs=None, sil_value=0.01, superv_coef=0.01, sil_alpha=0.6, sil_beta=0.1): self.sess = sess = get_session() if MPI is not None and comm is None: comm = MPI.COMM_WORLD with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) sil_model = policy(sess=sess) sil_model_2 = policy(sess=sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.sil = SelfImitation(sil_model.X, sil_model.vf, sil_model.entropy, sil_model.action, sil_model.value, sil_model.neg_log_prob, ob_space, ac_space, sil_model_2.X, sil_model_2.vf, fn_reward=fn_reward, fn_obs=fn_obs, n_env=nbatch_act, n_update=sil_update, w_value=sil_value, w_superv=superv_coef, w_entropy=ent_coef, gamma=gamma, max_steps=50000, max_nlogp=100, alpha=sil_alpha, beta=sil_beta) self.sil.set_loss_weight(0.1) self.sil.build_train_op(params, self.trainer) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101
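# --- Hedged sketch (toy NumPy check, not the model's tensors) ---
# The clipped value loss built in the graph above penalizes the worse of the clipped
# and unclipped squared errors, so the critic cannot move more than CLIPRANGE away
# from its previous prediction in a single update. Numbers below are illustrative.
import numpy as np

def clipped_value_loss(vpred, old_vpred, returns, cliprange):
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    losses_unclipped = np.square(vpred - returns)
    losses_clipped = np.square(vpred_clipped - returns)
    return 0.5 * np.mean(np.maximum(losses_unclipped, losses_clipped))

vpred = np.array([1.0, 2.0, 0.5])       # new value predictions
old_vpred = np.array([0.8, 1.0, 0.5])   # predictions when the data was collected
returns = np.array([1.0, 1.5, 1.0])     # empirical returns R
print(clipped_value_loss(vpred, old_vpred, returns, cliprange=0.2))  # ~0.083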
def learn(env, q_func, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimize for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress. set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: bool if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to max_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every step with state of the algorithm. If callback returns true training stops. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_state(model_file) return act
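# --- Hedged sketch (minimal stand-in for the LinearSchedule used above) ---
# Assumed behaviour: interpolate linearly from initial_p to final_p over
# schedule_timesteps steps, then stay at final_p. With the defaults above,
# exploration anneals from 1.0 to 0.02 over the first 10% of max_timesteps.
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

exploration = LinearSchedule(schedule_timesteps=int(0.1 * 100000), initial_p=1.0, final_p=0.02)
assert exploration.value(0) == 1.0
assert abs(exploration.value(5000) - 0.51) < 1e-6   # halfway through the annealing window
assert exploration.value(50000) == 0.02             # constant after annealing finishes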
def learn(env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant' # annealing for stepsize parameters (epsilon and adam) ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************"%iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], 
seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses,_,_ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_"+name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank()==0: logger.dump_tabular()
def learn(env, eval_env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, save_per_iter, ckpt_dir, log_dir, timesteps_per_batch, evaluation_freq, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, num_epochs=1000, callback=None): # configure log logger.configure(dir=log_dir) nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight != None)) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent vferr = tf.reduce_mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] # assert len(var_list) == len(vf_var_list) + 1 d_adam = MpiAdam(reward_giver.get_trainable_variables()) vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) d_adam.sync() vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 
timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) g_loss_stats = stats(loss_names) d_loss_stats = stats(reward_giver.loss_name) ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_state(pretrained_weight, var_list=pi.get_variables()) for epoch in range(num_epochs): if callback: callback(locals(), globals()) # Save model if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None: fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) saver = tf.train.Saver() saver.save(tf.get_default_session(), fname) logger.log("********** Epoch %i ************" % epoch) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ logger.log("Optimizing Policy...") total_obs = [] total_acs = [] total_ep_rets = [] total_ep_lens = [] total_ep_true_rets = [] for g_step_num in range(g_step): with timed("sampling"): seg = seg_gen.__next__() # Add seg into total_seg total_obs.append(seg["ob"]) total_acs.append(seg["ac"]) total_ep_rets.append(seg["ep_rets"]) total_ep_lens.append(seg["ep_lens"]) total_ep_true_rets.append(seg["ep_true_rets"]) add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms"): pi.ob_rms.update( mbob) # update running mean/std for policy g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) # Evaluate current policy if (g_step * epoch + g_step_num) % evaluation_freq == 0: evaluate_policy(pi, reward_giver, eval_env, g_step * epoch + g_step_num, timesteps_per_batch, tstart) # ------------------ Update D ------------------ logger.log("Optimizing Discriminator...") total_obs = np.vstack(total_obs) total_acs = np.vstack(total_acs) total_ep_rets = np.concatenate(total_ep_rets) total_ep_lens = np.concatenate(total_ep_lens) total_ep_true_rets = np.concatenate(total_ep_true_rets) logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(total_obs)) batch_size = len(total_obs) // d_step d_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for ob_batch, ac_batch in dataset.iterbatches( (total_obs, total_acs), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) # Update running mean/std for reward_giver if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
def learn(env, policy_func, *, timesteps_per_batch, # what to train on max_kl, cg_iters, gamma, lam, # advantage estimation entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_timesteps=0, max_episodes=0, max_iters=0, # time constraint callback=None ): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) entbonus = entcoeff * meanent vferr = U.mean(tf.square(pi.vpred - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = U.mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes 
and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular()
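# --- Hedged sketch of the conjugate-gradient solver the TRPO step relies on ---
# cg solves F x = g using only Fisher-vector products (f_Ax), never forming F.
# Tested here against a small explicit SPD matrix instead of fisher_vector_product.
import numpy as np

def cg(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    x = np.zeros_like(b)
    r = b.copy()        # residual b - A x (x starts at zero)
    p = r.copy()        # search direction
    rdotr = r.dot(r)
    for _ in range(cg_iters):
        Ap = f_Ax(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < residual_tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])   # symmetric positive definite
b = np.array([1.0, 2.0])
x = cg(lambda v: A.dot(v), b)
assert np.allclose(A.dot(x), b, atol=1e-6)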
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): self.max_grad_norm = max_grad_norm self.head_idx_current_batch = 0 self.critic_idx_current_batch = 0 sess = tf.compat.v1.get_default_session() self.running_stats_s = RunningStats() self.running_stats_s_ = RunningStats() self.running_stats_r = RunningStats() self.running_stats_r_i = RunningStats() train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, max_grad_norm) act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, max_grad_norm) self.train_model = train_model # in case we don't use rep loss rep_loss = None # HEAD_IDX = tf.compat.v1.placeholder(tf.int32, [None]) A = train_model.pdtype.sample_placeholder([None],name='A') A_i = train_model.A_i LATENT_FACTORS = train_model.pdtype.sample_placeholder([Config.REP_LOSS_M,Config.POLICY_NHEADS,None,count_latent_factors(Config.ENVIRONMENT)],name='LATENT_FACTORS') ADV = tf.compat.v1.placeholder(tf.float32, [None],name='ADV') R = tf.compat.v1.placeholder(tf.float32, [None],name='R') R_NCE = tf.compat.v1.placeholder(tf.float32, [Config.REP_LOSS_M,1,None],name='R_NCE') OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None],name='OLDNEGLOGPAC') OLDNEGLOGPAC_i = tf.compat.v1.placeholder(tf.float32, [None],name='OLDNEGLOGPAC_i') LR = tf.compat.v1.placeholder(tf.float32, [],name='LR') CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [],name='CLIPRANGE') if Config.CUSTOM_REP_LOSS: ADV_i= tf.compat.v1.placeholder(tf.float32, [None]) R_i = tf.compat.v1.placeholder(tf.float32, [None]) OLDVPRED_i = tf.compat.v1.placeholder(tf.float32, [None]) vpred_i = train_model.vf_i_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not vpredclipped_i = OLDVPRED_i + tf.clip_by_value(vpred_i - OLDVPRED_i, - CLIPRANGE, CLIPRANGE) vf_losses1_i = tf.square(vpred_i - R_i) vf_losses2_i = tf.square(vpredclipped_i - R_i) vf_loss_i = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1_i, vf_losses2_i)) # ADV = ADV + ADV_i # TD loss for critic # VF loss OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None],name='OLDVPRED') vpred = train_model.vf_train # Same as vf_run for SNI and default, but noisy for SNI2 while the boostrap is not if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1: vpred = vpred[self.critic_idx_current_batch] vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, - CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(input_tensor=tf.maximum(vf_losses1, vf_losses2)) neglogpac_train = train_model.pd_train[0].neglogp(A) ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train) pg_losses_train = -ADV * ratio_train pg_losses2_train = -ADV * tf.clip_by_value(ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(input_tensor=tf.maximum(pg_losses_train, pg_losses2_train)) approxkl_train = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC)) clipfrac_train = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio_train - 1.0), CLIPRANGE), dtype=tf.float32)) if Config.CUSTOM_REP_LOSS: neglogpac_train_i = train_model.pd_train_i.neglogp(A_i[:,0,self.head_idx_current_batch]) ratio_train_i = tf.exp(OLDNEGLOGPAC_i - neglogpac_train_i) pg_losses_train_i = -ADV_i * ratio_train_i pg_losses2_train_i = -ADV_i * tf.clip_by_value(ratio_train_i, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss_i = tf.reduce_mean(input_tensor=tf.maximum(pg_losses_train_i, pg_losses2_train_i)) else: pg_loss_i = 
tf.constant(0.,dtype=tf.float32) if Config.BETA >= 0: entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0]._components_distribution.entropy()) else: entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0].entropy()) # Add entropy and policy loss for the samples as well if Config.SNI or Config.SNI2: neglogpac_run = train_model.pd_run.neglogp(A) ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run) pg_losses_run = -ADV * ratio_run pg_losses2_run = -ADV * tf.clip_by_value(ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss += tf.reduce_mean(input_tensor=tf.maximum(pg_losses_run, pg_losses2_run)) pg_loss /= 2. entropy += tf.reduce_mean(input_tensor=train_model.pd_run.entropy()) entropy /= 2. approxkl_run = .5 * tf.reduce_mean(input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC)) clipfrac_run = tf.reduce_mean(input_tensor=tf.cast(tf.greater(tf.abs(ratio_run - 1.0), CLIPRANGE), dtype=tf.float32)) else: approxkl_run = tf.constant(0.) clipfrac_run = tf.constant(0.) params = tf.compat.v1.trainable_variables() weight_params = [v for v in params if '/b' not in v.name] total_num_params = 0 for p in params: shape = p.get_shape().as_list() num_params = np.prod(shape) mpi_print('param', p, num_params) total_num_params += num_params mpi_print('total num params:', total_num_params) l2_loss = tf.reduce_sum(input_tensor=[tf.nn.l2_loss(v) for v in weight_params]) # The first occurance should be in the train_model if Config.BETA >= 0: info_loss = tf.compat.v1.get_collection( key="INFO_LOSS", scope="model/info_loss" ) beta = Config.BETA elif Config.BETA_L2A >= 0: info_loss = tf.compat.v1.get_collection( key="INFO_LOSS_L2A", scope="model/info_loss" ) beta = Config.BETA_L2A else: info_loss = [tf.constant(0.)] beta = 0 # print(info_loss) assert len(info_loss) == 1 info_loss = info_loss[0] if Config.CUSTOM_REP_LOSS: rep_loss = tf.reduce_mean(train_model.rep_loss) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss if Config.SYNC_FROM_ROOT: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) trainer_pse = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) self.opt = trainer grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) tot_norm = tf.zeros((1,)) for g,v in grads_and_var: tot_norm += tf.norm(g) tot_norm = tf.reshape(tot_norm, []) _train = trainer.apply_gradients(grads_and_var) grads_and_var_pse = trainer_pse.compute_gradients(train_model.contrastive_loss, params) grads_pse, var_pse = zip(*grads_and_var_pse) if max_grad_norm is not None: grads_pse, _grad_norm = tf.clip_by_global_norm(grads_pse, max_grad_norm) grads_and_var_pse = list(zip(grads_pse, var_pse)) _train_pse = trainer_pse.apply_gradients(grads_and_var_pse) def train(lr, cliprange, obs, returns, masks, actions, infos, values, neglogpacs, rewards, mb_obs_1, mb_actions_1, mb_rewards_1, mb_obs_2, mb_actions_2, mb_rewards_2, train_target='policy'): values = values[:,self.critic_idx_current_batch] if Config.CUSTOM_REP_LOSS else values advs = returns - values adv_mean = np.mean(advs, axis=0, keepdims=True) adv_std = np.std(advs, axis=0, keepdims=True) advs = (advs - adv_mean) / (adv_std + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, train_model.R:rewards, train_model.A:actions, LR:lr, 
CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values, train_model.pse_obs_1:mb_obs_1, train_model.pse_actions_1:mb_actions_1, train_model.pse_rewards_1:mb_rewards_1, train_model.pse_obs_2:mb_obs_2, train_model.pse_actions_2:mb_actions_2, train_model.pse_rewards_2:mb_rewards_2 } # import ipdb;ipdb.set_trace() if train_target == 'policy': rets = sess.run( [pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train, approxkl_run, clipfrac_run, l2_loss, info_loss, tot_norm, _train],td_map)[:-1] return rets elif train_target == 'pse': rets = sess.run( [_train_pse],td_map)[:-1] return rets self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train', 'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss', 'info_loss_cv', 'rep_loss', 'value_i_loss', 'policy_loss_i','gradient_norm'] def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = save self.load = load self.rep_vec = act_model.rep_vec self.custom_train = train_model.custom_train if Config.SYNC_FROM_ROOT: if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="") sess.run(tf.compat.v1.global_variables_initializer()) sync_from_root(sess, global_variables) #pylint: disable=E1101 else: initialize()
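# --- Hedged sketch (NumPy check of global-norm gradient clipping) ---
# Both trainers above clip gradients by global norm before apply_gradients: a single
# shared scale factor max_grad_norm / global_norm is applied when the norm exceeds
# the threshold, so the direction of the update is preserved.
import numpy as np

def clip_by_global_norm(grads, max_grad_norm):
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = min(1.0, max_grad_norm / (global_norm + 1e-12))
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 0.0]), np.array([0.0, 4.0])]   # global norm = 5
clipped, norm = clip_by_global_norm(grads, max_grad_norm=0.5)
assert np.isclose(norm, 5.0)
assert np.isclose(np.sqrt(sum(np.sum(np.square(g)) for g in clipped)), 0.5)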
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(tf.get_default_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) min_stepsize = np.float32(1e-8) max_stepsize = np.float32(1e0) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval() elif kl < desired_kl / 2: logger.log("kl too low") tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval() else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1 coord.request_stop() coord.join(enqueue_threads)
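# --- Hedged sketch of the advantage estimation used above ---
# common.discount is assumed to compute a reverse discounted cumulative sum; the loop
# above then bootstraps the last value prediction (or 0 on termination) and discounts
# the TD residuals with gamma * lam. Toy arrays below are illustrative only.
import numpy as np

def discount(x, gamma):
    # out[t] = x[t] + gamma * out[t + 1]
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

gamma, lam = 0.99, 0.97
rew_t = np.array([1.0, 0.0, 1.0])
vpred_t = np.array([0.5, 0.4, 0.6])
terminated = True

return_t = discount(rew_t, gamma)                                   # value-function target
vpred_ext = np.append(vpred_t, 0.0 if terminated else vpred_t[-1])  # terminal bootstrap
delta_t = rew_t + gamma * vpred_ext[1:] - vpred_ext[:-1]            # TD residuals
adv_t = discount(delta_t, gamma * lam)                              # advantage estimate
print(return_t, adv_t)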
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
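# A plain-NumPy sketch of the clipped losses assembled in the ppo2 model above.
# ppo_losses is a hypothetical helper; its array arguments stand in for the
# placeholders (OLDNEGLOGPAC, ADV, R, OLDVPRED, CLIPRANGE). Shown for clarity,
# not as the ppo2 implementation itself.
import numpy as np

def ppo_losses(neglogpac, old_neglogpac, adv, returns, vpred, old_vpred, cliprange):
    ratio = np.exp(old_neglogpac - neglogpac)                 # pi_new(a|s) / pi_old(a|s)
    pg_losses = -adv * ratio
    pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))      # clipped surrogate objective

    # Clipped value loss, mirroring vpredclipped above.
    vpredclipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum((vpred - returns) ** 2,
                                       (vpredclipped - returns) ** 2))

    approxkl = 0.5 * np.mean((neglogpac - old_neglogpac) ** 2)
    clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
    return pg_loss, vf_loss, approxkl, clipfrac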
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names
        of registered models in baselines.common.models (mlp, cnn, conv_only). If a function,
        should take an observation tensor and return a latent variable tensor, which will be
        mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at the end of
        the training. If you do not wish to restore the best version at the end of the training
        set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value
        to 1.0. If set to None, equals total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) return act
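# A minimal sketch of the linear exploration annealing used in the deepq loop
# above: epsilon is interpolated from initial_p to final_p over the first
# schedule_timesteps steps and held constant afterwards. LinearScheduleSketch
# is a hypothetical stand-in that approximates baselines' LinearSchedule for
# illustration only.
class LinearScheduleSketch:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. LinearScheduleSketch(1000, initial_p=1.0, final_p=0.02).value(500) -> 0.51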
def learn( env, var_func, cvar_func, nb_atoms, run_alpha=None, lr=5e-4, max_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=1, batch_size=32, print_freq=1, checkpoint_freq=10000, learning_starts=1000, gamma=0.95, target_network_update_freq=500, num_cpu=4, callback=None, periodic_save_freq=1000000, periodic_save_path=None, grad_norm_clip=None, ): """Train a CVaR DQN model. Parameters ------- env: gym.Env environment to train on var_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. cvar_func: function same as var_func nb_atoms: int number of atoms used in CVaR discretization run_alpha: float optimize CVaR_alpha while running. None if you want random alpha each episode. lr: float learning rate for adam optimizer max_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the best model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. num_cpu: int number of cpus to use for training callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. periodic_save_freq: int How often do we save the model - periodically periodic_save_path: str Where do we save the model - periodically grad_norm_clip: float Clip gradient to this value. No clipping if None Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/distdeepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model sess = make_session(num_cpu=num_cpu) sess.__enter__() obs_space_shape = env.observation_space.shape def make_obs_ph(name): return U.BatchInput(obs_space_shape, name=name) act, train, update_target, debug = build_train( make_obs_ph=make_obs_ph, var_func=var_func, cvar_func=cvar_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, nb_atoms=nb_atoms, grad_norm_clipping=grad_norm_clip) act_params = { 'make_obs_ph': make_obs_ph, 'cvar_func': cvar_func, 'var_func': var_func, 'num_actions': env.action_space.n, 'nb_atoms': nb_atoms } # Create the replay buffer replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. 
exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True episode = 0 alpha = 1. # --------------------------------- RUN --------------------------------- with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): print('Target reached') model_saved = False break # Take action and update exploration to the newest value update_eps = exploration.value(t) update_param_noise_threshold = 0. action = act(np.array(obs)[None], alpha, update_eps=update_eps)[0] reset = False new_obs, rew, done, _ = env.step(action) # ===== DEBUG ===== # s = np.ones_like(np.array(obs)[None]) # a = np.ones_like(act(np.array(obs)[None], run_alpha, update_eps=update_eps)) # r = np.array([0]) # s_ = np.ones_like(np.array(obs)[None]) # d = np.array([False]) # s = obs[None] # a = np.array([action]) # r = np.array([rew]) # s_ = new_obs[None] # d = np.array([done]) # if t % 100 == 0: # for f in debug: # print(f(s, a, r, s_, d)) # print('-------------') # # # print([sess.run(v) for v in tf.global_variables('cvar_dqn/cvar_func')]) # # print([sess.run(v) for v in tf.global_variables('cvar_dqn/var_func')]) # ================= # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if run_alpha is None: alpha = np.random.random() if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. 
update_target() # Log results and periodically save the model mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.record_tabular("(current alpha)", "%.2f" % alpha) logger.dump_tabular() # save and report best model if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) U.save_state(model_file) model_saved = True saved_mean_reward = mean_100ep_reward # save periodically if periodic_save_freq is not None and periodic_save_path is not None and t > learning_starts: if t % periodic_save_freq == 0: ActWrapper(act, act_params).save("{}-{}.pkl".format( periodic_save_path, int(t / periodic_save_freq))) if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) U.load_state(model_file) return ActWrapper(act, act_params)
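# A small standalone sketch of the reward bookkeeping and "keep the best
# checkpoint" pattern shared by both DQN-style loops above. record_step and
# maybe_save_best are hypothetical helpers; save_fn stands in for
# U.save_state / save_variables. Illustration only.
import numpy as np

def record_step(episode_rewards, rew, done):
    # episode_rewards is assumed to start as [0.0], as in the loops above.
    episode_rewards[-1] += rew
    if done:
        episode_rewards.append(0.0)            # start accumulating a new episode
    # mean over the last 100 *completed* episodes, mirroring the slicing above
    return round(float(np.mean(episode_rewards[-101:-1])), 1)

def maybe_save_best(mean_100ep_reward, saved_mean_reward, save_fn, model_file):
    if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
        save_fn(model_file)
        return mean_100ep_reward, True         # new best checkpoint written
    return saved_mean_reward, False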
)
approximate_num_iters = args.num_steps / 4
exploration = PiecewiseSchedule([
    (0, 1.0),
    (approximate_num_iters / 50, 0.1),
    (approximate_num_iters / 5, 0.01)
], outside_value=0.01)

if args.prioritized:
    replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha)
    beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0)
else:
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

U.initialize()
update_target()
num_iters = 0

# Load the model
state = maybe_load_model(savedir, container)
if state is not None:
    num_iters, replay_buffer = state["num_iters"], state["replay_buffer"]
    monitored_env.set_state(state["monitor_state"])

start_time, start_steps = None, None
steps_per_iter = RunningAvg(0.999)
iteration_time_est = RunningAvg(0.999)
obs = env.reset()
num_iters_since_reset = 0
reset = True
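# A minimal sketch of the piecewise-linear annealing behind the PiecewiseSchedule
# used above. piecewise_value is a hypothetical function, simplified relative to
# the schedule class in baselines (which has more options); outside_value is
# returned when t falls outside the listed endpoints.
def piecewise_value(t, endpoints, outside_value):
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value

# e.g. piecewise_value(0, [(0, 1.0), (10, 0.1), (50, 0.01)], outside_value=0.01) == 1.0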
def learn(env, policy_func, reward_giver, expert_dataset, rank, pretrained, pretrained_weight, *, g_step, d_step, entcoeff, ckpt_dir, timesteps_per_batch, task_name, gamma, lam, max_kl, cg_iters, cg_damping=1e-2, vf_stepsize=3e-4, d_stepsize=3e-4, vf_iters=3, max_timesteps=0, max_episodes=0, max_iters=0, rnd_iter=200, callback=None, dyn_norm=False, mmd=False): nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) oldpi = policy_func("oldpi", ob_space, ac_space) atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = entcoeff * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = pi.get_trainable_variables() var_list = [ v for v in all_var_list if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd") ] vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")] vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) # pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = pi.vlossandgrad def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() if rank == 0: print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, reward_giver, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards true_rewbuffer = deque(maxlen=40) assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1 ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) # if provide pretrained weight if pretrained_weight is not None: U.load_variables(pretrained_weight, variables=pi.get_variables()) else: if not dyn_norm: pi.ob_rms.update(expert_dataset[0]) if not mmd: 
reward_giver.train(*expert_dataset, iter=rnd_iter) #inspect the reward learned # for batch in iterbatches(expert_dataset, batch_size=32): # print(reward_giver.get_reward(*batch)) best = -2000 save_ind = 0 max_save = 3 while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p # ------------------ Update G ------------------ # logger.log("Optimizing Policy...") for _ in range(g_step): seg = seg_gen.__next__() #mmd reward if mmd: reward_giver.set_b2(seg["ob"], seg["ac"]) seg["rew"] = reward_giver.get_reward(seg["ob"], seg["ac"]) #report stats and save policy if any good lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"] ) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) true_rewbuffer.extend(true_rets) lenbuffer.extend(lens) rewbuffer.extend(rews) true_rew_avg = np.mean(true_rewbuffer) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpTrueRewMean", true_rew_avg) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("Best so far", best) # Save model if ckpt_dir is not None and true_rew_avg >= best: best = true_rew_avg fname = os.path.join(ckpt_dir, task_name) os.makedirs(os.path.dirname(fname), exist_ok=True) pi.save_policy(fname + "_" + str(save_ind)) save_ind = (save_ind + 1) % max_save #compute gradient towards next policy add_vtarg_and_adv(seg, gamma, lam) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ob_rms") and dyn_norm: pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] assign_old_eq_new( ) # set old parameter values to new parameter values *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=False) assert np.isfinite(stepdir).all() shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. 
shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) if pi.use_popart: pi.update_popart(tdlamret) for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=128): if hasattr(pi, "ob_rms") and dyn_norm: pi.ob_rms.update( mbob) # update running mean/std for policy vfadam.update(allmean(compute_vflossandgrad(mbob, mbret)), vf_stepsize) g_losses = meanlosses for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) if rank == 0: logger.dump_tabular()