def ppo(env_fn, GUI=True, actor_critic=my_mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, on_policy=True, prev_epochs=0): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. GUI : Whether or not display GUI during training. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) if GUI: env = env_fn("GUI", prev_epochs) else: env = env_fn("DIRECT", prev_epochs) obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space sess = tf.Session() # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # Main outputs from computation graph pi, logp, logp_pi, v, mu, log_std = actor_critic(x_ph, a_ph, **ac_kwargs) # if load_path==None: # # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) # # Main outputs from computation graph # pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # else: # fname = osp.join(load_path, 'tf1_save') # print('\n\nLoading old model from %s.\n\n' % fname) # # # load the things! # model = restore_tf_graph(sess, fname) # x_ph, a_ph = model['x'], model['a'] # pi, logp, logp_pi, v = model['pi'], model['logp'], model['logp_pi'], model['v'] # Calculated through one epoch, assigned by buf's methods adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi }) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # lllogp, mmmu, llog_std = sess.run([logp, mu, log_std], feed_dict=inputs) # logp is basically the same as logp_old_ph, the error starts from 1e-6, # and this error is a little strange... # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): last_noise_time = 0.0 noise = np.zeros(12) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape( 1, -1)}) # CHANGE THE feed_dict HERE! # aa = a.copy() # if 2.0 < env.t < 4.0: # # on_policy = False # if env.t - last_noise_time > 0.1: # noise = np.random.uniform(-0.5 * np.pi, 0.5 * np.pi, 12) # last_noise_time += 0.1 # a += noise # logp_t = sess.run(logp, feed_dict={x_ph: o.reshape(1, -1), a_ph: a}) # else: # # on_policy = True # pass # print("time:", env.t, a-aa) if not on_policy: a = np.array([get_action_from_target_policy(env.t)]) logp_t = sess.run(logp, feed_dict={ x_ph: o.reshape(1, -1), a_ph: a }) env.history_buffer['last_action'] = a[0] for i in range( 25): # Change the frequency of control from 500Hz to 20Hz o2, r, d, o2_dict = env.step(a[0]) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 # print(ep_len, d) terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target if d: last_val = 0 # print(o2_dict['position']) # print(np.alltrue(o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]) is True) # print(np.alltrue([o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]])) # print("I did it!!!") else: # last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)}) last_val = 0 buf.finish_path(last_val) print(ep_ret) # logger.store(EpRet=ep_ret+last_val, EpLen=ep_len) # if terminal: # o, ep_ret, ep_len = env.reset(), 0, 0 if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 last_noise_time = 0.0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() env.addEpoch() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() # show the log if time.ctime()[-13:-11] == '09': break env.close()
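# Hedged illustration (not part of the training code above): a small NumPy check of the
# clipped-surrogate construction used in ppo(). The graph builds
#   min_adv = where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
#   pi_loss = -mean(min(ratio * adv, min_adv))
# which is algebraically the same as the more familiar
#   -mean(min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv)).
# The helper below only demonstrates that equivalence on random inputs; its name and the
# standalone NumPy form are illustrative and are not used by the code above.
def _ppo_clip_objective_sketch(clip_ratio=0.2, n=1000, seed=0):
    import numpy as np  # the module already relies on numpy as np
    rng = np.random.RandomState(seed)
    ratio = np.exp(rng.randn(n) * 0.1)   # stand-in for pi(a|s) / pi_old(a|s)
    adv = rng.randn(n)                   # stand-in for GAE advantages
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    loss_where_form = -np.mean(np.minimum(ratio * adv, min_adv))
    loss_clip_form = -np.mean(np.minimum(ratio * adv,
                                         np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv))
    assert np.allclose(loss_where_form, loss_clip_form)
    return loss_where_form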
def gail(env_fn, traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(),
         d_hidden_size=64, d_batch_size=64, seed=0, steps_per_epoch=4000, epochs=50,
         gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40,
         train_v_iters=40, lam=0.97, max_ep_len=4000, beta=1e-4, target_kl=0.01,
         logger_kwargs=dict(), save_freq=100, r_env_ratio=0, gail_ratio=1, d_itr=20,
         reward_type='negative', pretrain_bc_itr=0):
    """
    GAIL on top of the PPO update above.

    Additional args:
        d_hidden_size : hidden layer size of the Discriminator.
        d_batch_size : the Discriminator's batch size.
        r_env_ratio, gail_ratio : weights of the environment and GAIL rewards.
            Total reward = gail_ratio * rew_gail + r_env_ratio * rew_from_environment.
        d_itr : number of Discriminator update iterations per epoch.
        reward_type : the GAIL reward has three types: ['negative', 'positive', 'AIRL']
            (see the illustrative sketch after this function).
        trj_num : the number of expert trajectories kept in the SIBuffer.
        pretrain_bc_itr : number of behavior-cloning pretraining iterations.
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env, hidden_size=d_hidden_size, reward_type=reward_type)

    e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',')
    e_act = np.loadtxt(traj_dir + '/actions.csv', delimiter=',')  # demo trajectory

    Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=0, max_size=None)

    assert e_obs.shape[1:] == obs_dim

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, pi_std, entropy, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    # buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)  # add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) - beta * entropy
    v_loss = tf.reduce_mean((ret_ph - v)**2)  # ret_ph is fed the buffer of cumulative (discounted) returns

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)              # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()

    BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}  # all_phs holds the placeholders matching each buffer array
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  # if the KL exceeds 1.5x the target, log it and stop the policy updates
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update (compute the new losses)
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)

        std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=std_ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
                     DeltaLossV=(v_l_new - v_l_old),
                     Std=std)

    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0

    if pretrain_bc_itr > 0:
        BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            buf.store_rew(r)
            '''
            if t < 150:
                env.render()
                time.sleep(0.03)
            '''
            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if d:
                    last_val = r
                else:
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.store_rew(last_val)  # if terminal this is just the final reward; if cut off at max length, the bootstrapped value is used
                buf.finish_path()
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret_task, EpLen=ep_len)  # , EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail
                o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        agent_obs, agent_act = buf.obs_buf, buf.act_buf
        d_batch_size = d_batch_size  # or len(agent_obs) // d_itr

        # Update discriminator
        for _t in range(d_itr):
            e_obs_batch, e_act_batch = Sibuffer.get_random_batch(d_batch_size)
            a_obs_batch = sample_batch(agent_obs, batch_size=d_batch_size)
            a_act_batch = sample_batch(agent_act, batch_size=d_batch_size)
            D.train(sess, e_obs_batch, e_act_batch, a_obs_batch, a_act_batch)
        js_d = D.get_js_div(sess, Sibuffer.main_obs_buf, Sibuffer.main_act_buf, agent_obs, agent_act)

        # --------------- get GAIL reward ------------------------------
        rew_gail = D.get_reward(sess, agent_obs, agent_act).ravel()

        buf.rew_buf = gail_ratio * rew_gail + r_env_ratio * buf.rew_buf
        for path_slice in buf.slicelist[:-1]:
            ep_ret_gail = rew_gail[path_slice].sum()
            ep_ret_sum = buf.rew_buf[path_slice].sum()
            logger.store(EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail)

        buf.culculate_adv_buf()

        # ------------- Perform PPO update! --------------------
        update()

        logger.store(JS=js_d)

        # Log info about epoch
        # if epoch % 10 == 0:  # print logs every 10 epochs
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('JS', average_only=True)
        # logger.log_tabular('JS_Ratio', average_only=True)
        logger.dump_tabular()
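# Hedged sketch (the Discriminator class used by gail() is not shown in this file, so the
# exact formulas it uses are assumptions): one common way the three reward_type options
# could map a discriminator output D(s, a) in (0, 1) to a per-step reward, combined with
# the mixing rule actually used above,
#   buf.rew_buf = gail_ratio * rew_gail + r_env_ratio * buf.rew_buf.
# All names in this helper are illustrative.
def _gail_reward_sketch(d_sa, rew_env, reward_type='negative',
                        gail_ratio=1.0, r_env_ratio=0.0, eps=1e-8):
    import numpy as np
    d_sa = np.clip(np.asarray(d_sa, dtype=np.float64), eps, 1.0 - eps)
    if reward_type == 'positive':    # always >= 0, tends to encourage long episodes
        rew_gail = -np.log(1.0 - d_sa)
    elif reward_type == 'negative':  # always <= 0, tends to encourage short episodes
        rew_gail = np.log(d_sa)
    elif reward_type == 'AIRL':      # log-odds form used by AIRL-style rewards
        rew_gail = np.log(d_sa) - np.log(1.0 - d_sa)
    else:
        raise ValueError(reward_type)
    return gail_ratio * rew_gail + r_env_ratio * np.asarray(rew_env)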
def ppo(env_fn, ref_func=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=500, epochs=10000, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=500, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) t_a_ph = core.placeholder_from_space(env.action_space) ret_ph = core.placeholder(None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, t_a_ph, ret_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) print("---------------", local_steps_per_epoch) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # dagger objectives pi_loss = tf.reduce_mean(tf.square(pi - t_a_ph)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old = sess.run([pi_loss, v_loss], feed_dict=inputs) # Training for i in range(train_pi_iters): sess.run(train_pi, feed_dict=inputs) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new = sess.run([pi_loss, v_loss], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(1, epochs + 1, 1): for t in range(local_steps_per_epoch): a_s, v_t, logp_t = sess.run( get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)}) a = a_s[0] ref_a = call_mpc(env, ref_func) if (epoch < 100): a = ref_a # save and log buf.store(o, a, ref_a, r) o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: np.array(o).reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
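# Hedged sketch of the imitation scheme in the ppo() variant above: the policy is regressed
# onto a reference controller's action with a mean-squared error
# (pi_loss = mean((pi - t_a_ph)**2)), the reference action is always stored as the label,
# and during the first 100 epochs the reference action is also the one executed
# (a DAgger-style warm start). `policy` and `expert` below are illustrative callables,
# not objects defined in this file (the code above obtains the label via call_mpc).
def _dagger_rollout_sketch(env, policy, expert, epoch, warmup_epochs=100, steps=500):
    import numpy as np
    data = []                      # (observation, expert-action-label) pairs
    o = env.reset()
    for _ in range(steps):
        ref_a = expert(o)          # label from the reference controller
        a = ref_a if epoch < warmup_epochs else policy(o)
        data.append((np.asarray(o), np.asarray(ref_a)))
        o, r, d, _ = env.step(a)
        if d:
            o = env.reset()
    return data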
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 maxRev = float("-inf") #negative infinity in the beginning #maxRevActionSeq=[] maxRevTSTT = 0 maxRevRevenue = 0 maxRevThroughput = 0 maxRevJAH = 0 maxRevRemVeh = 0 maxRevJAH2 = 0 maxRevRMSE_MLvio = 0 maxRevPerTimeVio = 0 maxRevHOTDensity = pd.DataFrame() maxRevGPDensity = pd.DataFrame() maxtdJAHMax = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu numpyFromA = np.array(a[0]) numpyFromA = ((numpyFromA + 1.0) * (env.state.tollMax - env.state.tollMin) / 2.0) + env.state.tollMin a[0] = np.ndarray.tolist(numpyFromA) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) #get other stats and store them too otherStats = env.getAllOtherStats() if np.any(np.isnan(np.array(otherStats))): sys.exit("Nan found in statistics! Error") logger.store(EpTSTT=otherStats[0], EpRevenue=otherStats[1], EpThroughput=otherStats[2], EpJAH=otherStats[3], EpRemVeh=otherStats[4], EpJAH2=otherStats[5], EpMLViolRMSE=otherStats[6], EpPerTimeVio=otherStats[7], EptdJAHMax=otherStats[8]) #determine max rev profile if ep_ret > maxRev: maxRev = ep_ret maxRevActionSeq = env.state.tollProfile maxRevTSTT = otherStats[0] maxRevRevenue = otherStats[1] maxRevThroughput = otherStats[2] maxRevJAH = otherStats[3] maxRevRemVeh = otherStats[4] maxRevJAH2 = otherStats[5] maxRevRMSE_MLvio = otherStats[6] maxRevPerTimeVio = otherStats[7] maxRevHOTDensity = env.getHOTDensityData() maxRevGPDensity = env.getGPDensityData() maxtdJAHMax = otherStats[8] o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpTSTT', average_only=True) logger.log_tabular('EpRevenue', average_only=True) logger.log_tabular('EpThroughput', average_only=True) logger.log_tabular('EpJAH', average_only=True) logger.log_tabular('EpRemVeh', average_only=True) logger.log_tabular('EpJAH2', average_only=True) logger.log_tabular('EpMLViolRMSE', average_only=True) logger.log_tabular('EpPerTimeVio', average_only=True) logger.log_tabular('EptdJAHMax', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print("Max cumulative reward obtained= %f " % maxRev) print( "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f" % (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax)) outputVector = [ maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax ] #print("\n===Max rev action sequence is\n",maxRevActionSeq) exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector) exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
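# Hedged helper restating the action rescaling done inline in the loop above: actions are
# sampled from a tanh-squashed policy in (-1, 1) and mapped affinely onto the toll range
# [tollMin, tollMax] before being passed to env.step(). This standalone function is for
# clarity only and is not called by the code above.
def _scale_tanh_action_to_toll(a, toll_min, toll_max):
    import numpy as np
    a = np.asarray(a, dtype=np.float64)
    return ((a + 1.0) * (toll_max - toll_min) / 2.0) + toll_min

# Example (illustrative): _scale_tanh_action_to_toll(0.0, 1.0, 5.0) == 3.0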
def pg_linesearch(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, backtrack_coeff=0.8, delta=0.01, backtrack_iters=1000, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = trpo_core.get_vars('pi') gradient = trpo_core.flat_grad(pi_loss, pi_params) #v_ph, hvp = trpo_core.hessian_vector_product(d_kl, pi_params) v_ph = tf.placeholder(tf.float32, shape=gradient.shape) ##TODO: more analysis on damping Coeff #if damping_coeff > 0: #hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = trpo_core.flat_concat(pi_params) set_pi_params = trpo_core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ ##TODO: Next Step is to try the hessian x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
p = r.copy() r_dot_old = np.dot(r, r) cg_iters = 20 for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} #TODO: Next step is to calculate the hessian using safe distance #Hx = lambda x : mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old, ent = sess.run( [gradient, pi_loss, v_loss, approx_ent], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) #x = cg(Hx, g) #x = optimize.fmin_cg(pi_l_old, x0, fprime=g) x = g old_params = sess.run(get_pi_params) old_penalty = env.penalty(env.s) alpha = np.sqrt(2 * delta / (np.dot(x, g) + EPS)) # backtracking line search, hard constraint check on env penalty for j in range(backtrack_iters): step = backtrack_coeff**j sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) pi_l_new = sess.run([pi_loss], feed_dict=inputs) penalty = env.penalty(env.s) #print("Old Penalty {}, Penalty {}".format(old_penalty,penalty)) if penalty == 0 or penalty < old_penalty: #if pi_l_new <= pi_l_old: logger.log('Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) logger.store(penalty=penalty, old_penalty=old_penalty) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) logger.store(penalty=penalty, old_penalty=old_penalty) # Policy gradient step #sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update #pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict={v_ph: old_params - alpha * x * step}) logger.store(LossPi=pi_l_old, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', average_only=True) logger.log_tabular('penalty', average_only=True) logger.log_tabular('old_penalty', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
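# Hedged, standalone restatement of the cg() helper defined inside pg_linesearch() above,
# which can be checked against numpy's direct solver on a small symmetric positive-definite
# system. EPS is assumed to be a small module-level constant (e.g. 1e-8); here it is passed
# explicitly so the sketch is self-contained.
def _conjugate_gradient_sketch(Ax, b, cg_iters=20, eps=1e-8):
    import numpy as np
    x = np.zeros_like(b)
    r = b.copy()                       # residual for x = 0, since Ax(0) = 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(cg_iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

# Example check (illustrative): with A = np.array([[4., 1.], [1., 3.]]) and
# b = np.array([1., 2.]), _conjugate_gradient_sketch(lambda v: A @ v, b) is close to
# np.linalg.solve(A, b).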
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, explorer=None, eps=.03, pretrain_epochs=0): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_epochs = epochs + pretrain_epochs # Main loop: collect experience in env and update/log each epoch for epoch in range(total_epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # explore if you are in a pretrain epoch or if eps-greedy pre = epoch < pretrain_epochs during = random.random() < eps if pre or during: if explorer is None: raise ValueError('Trying to explore but explorer is None') state = env.env.state_vector() a = explorer.sample_action(state) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
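# Hedged sketch of the exploration gate used in the experience loop above: the explorer's
# action replaces the policy action while the current epoch is still a pretrain epoch, or
# with probability eps afterwards (a simple epsilon-greedy mix). The explorer is assumed to
# expose sample_action(state), as in the code above; everything else is illustrative.
def _maybe_explore(policy_action, state, explorer, epoch, pretrain_epochs, eps=0.03):
    import random
    pre = epoch < pretrain_epochs
    during = random.random() < eps
    if pre or during:
        if explorer is None:
            raise ValueError('Trying to explore but explorer is None')
        return explorer.sample_action(state)
    return policy_action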
def ppo(BASE_DIR, expert_density, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), steps_per_epoch=1000, epochs=10, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=50, train_v_iters=50, lam=0.97, max_ep_len=1000, target_kl=0.01, data_n=10): data = {} # ALL THE DATA logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # update rule def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) policy_distr = Gaussian_Density() policy = lambda s: np.random.uniform( -2.0, 2.0, size=env.action_space.shape) # random policy policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length) density = policy_distr.density() data[0] = { 'pol_s': policy_distr.num_samples, 'pol_t': policy_distr.num_trajects } dist_rewards = [] # repeat REIL for given number of rounds for i in range(args.rounds): message = "\nRound {} out of {}\n".format(i + 1, args.rounds) reward = lambda s: expert_density(s) / (density(s) + args.eps) dist_rewards.append(reward) start_time = time.time() o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 r = reward(o) # custom reward # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, old_r, d, _ = env.step(a[0]) r = reward(o) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print( 'Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = old_r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) last_val = reward(o) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 r = reward(o) # store model! if (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print(message) policy = lambda state: sess.run( get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0] data[i] = { 'pol_s': policy_distr.num_samples, 'pol_t': policy_distr.num_trajects } data[i]['rewards'] = evaluate_reward(env, policy, data_n) if i != args.rounds - 1: policy_distr.train(env, policy, args.trajects, args.distr_gamma, args.iter_length) density = policy_distr.density() return data, dist_rewards
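# Hedged sketch of the reward used by the ppo() variant above: each round replaces the
# environment reward with a density ratio between the expert's state distribution and the
# current policy's state distribution,
#   r(s) = expert_density(s) / (policy_density(s) + eps),
# so states the expert visits often (relative to the policy) become rewarding. The two
# density callables are assumptions standing in for expert_density and the
# Gaussian_Density estimate used above.
def _density_ratio_reward(state, expert_density, policy_density, eps=1e-3):
    return expert_density(state) / (policy_density(state) + eps)

# Example (illustrative): _density_ratio_reward(0.0, lambda s: 1.0, lambda s: 0.5) is
# approximately 2.0.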
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=33, steps_per_epoch=4000, epochs=50, gamma=0.998, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=60, train_v_iters=60, lam=0.95, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" ## Logger setup logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) ## Random seed setting seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) ## Environment instantiation env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape #Policies vector (only one for this project) policies = [] #TensorFlow session sess = tf.Session() #Build policy anc value networks MAP = MAPolicy(scope='policy_0', ob_space=env.observation_space, ac_space=env.action_space, network_spec=pi_specs, normalize=True, v_network_spec=v_specs) policies = [MAP] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Create aux placeholders for the computation graph adv_ph, ret_ph, logp_old_ph = core.placeholders(1, 1, 1) # Get main placeholders for the computation graph map_phs_dict = MAP.phs map_phs = [v for k, v in map_phs_dict.items()] for k, v in map_phs_dict.items(): if v.name == None: v.name = k # Append aux and main placeholders # Need placeholders in *this* order later (to zip with data from buffer) new_phs = [adv_ph, ret_ph, logp_old_ph] all_phs = np.append(map_phs, new_phs) # Intantiate Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['policy_net', 'vpred_net']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(MAP.taken_action_logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # PPO-clip limits pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # Policy loss function v_loss = tf.reduce_mean( (ret_ph - MAP.scaled_value_tensor)**2) # Value loss function # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - MAP.taken_action_logp ) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -MAP.taken_action_logp ) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or( ratio > (1 + clip_ratio), ratio < (1 - clip_ratio) ) # a logical value which states whether there was clipping clipfrac = tf.reduce_mean(tf.cast( clipped, tf.float32)) # a measure of clipping for posterior analysis # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize( pi_loss) #Policy network optimizer train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize( v_loss) #Value network optimizer #initialize TensorFlow variabels sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Set up logger variables to be saved (it is necessary to save everything that is # input/output to the networks so that the policy can be played afterwards during testing) out_act_dict = MAP.sampled_action out_state_dict = MAP.state_out logger_outputs = {**out_act_dict, **out_state_dict} for k, v in logger_outputs.items(): if 'lstm' in k: logger_outputs[k + '_out'] = logger_outputs.pop(k) logger_inputs = map_phs_dict logger.setup_tf_saver(sess, inputs=logger_inputs, outputs=logger_outputs) # ======================================================================== # # ===================== Auxiliary Training Functions ===================== # # ======================================================================== # # Compute metrics for analysis during and after training def 
compute_metrics(extra_dict={}): loss_outs = { 'pi_loss': pi_loss, 'v_loss': v_loss, 'approx_ent': approx_ent, 'approx_kl': approx_kl, 'approx_cf': clipfrac, 'taken_action_logp': MAP.taken_action_logp, 'ratio': ratio, 'min_adv': min_adv } out_loss = policies[0].sess_run(buf.obs_buf, sess_act=sess, extra_feed_dict=extra_dict, other_outputs=loss_outs, replace=True) return out_loss['pi_loss'], out_loss['v_loss'], out_loss[ 'approx_ent'], out_loss['approx_kl'], out_loss['approx_cf'] # ======================================================================= # # Run session on policy and value optimizers for training their respective networks def train(net, extra_dict={}): if net == 'pi': train_outs = {'train_pi': train_pi, 'approx_kl': approx_kl} elif net == 'v': train_outs = {'train_v': train_v} else: print("Error: Network not defined") return out_train = policies[0].sess_run(buf.obs_buf, sess_act=sess, extra_feed_dict=extra_dict, other_outputs=train_outs, replace=True) if net == 'pi': return out_train['approx_kl'] # ======================================================================= # # Perform training procedure def update(): print("======= update!") # get aux data from the buffer and match it with its respective placeholders buf_data = buf.get(aux_vars_only=True) aux_inputs = {k: v for k, v in zip(new_phs, buf_data)} # for training, the actions taken during the experience loop are also inputs to the network extra_dict = {k: v for k, v in buf.act_buf.items() if k != 'vpred'} for k, v in extra_dict.items(): if k == 'action_movement': extra_dict[k] = np.expand_dims(v, 1) # actions and aux variables from the buffer are joined and passed to compute_metrics (observations are joined within those functions) extra_dict.update(aux_inputs) pi_l_old, v_l_old, ent, kl, cf = compute_metrics(extra_dict) # Policy training loop for i in range(train_pi_iters): if i % 10 == 0: print("training pi iter ", i) kl = train('pi', extra_dict) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.'
% i) break logger.store(StopIter=i) print("") # Value training loop for j in range(train_v_iters): if j % 10 == 0: print("training v iter ", j) train('v', extra_dict) # Log changes from update with a new run on compute_metrics pi_l_new, v_l_new, ent, kl, cf = compute_metrics(extra_dict) #Store information logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) #Reset experience varibales o, ep_ret, ep_len = env.reset(), 0, 0 #Reset policy for policy in policies: policy.reset() print("======= update finished!") # ======================================================================= # start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # ======================================================================= # # ========================== Experience Loop ============================ # # ======================================================================= # # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): print("epoch: ", epoch) for t in range(local_steps_per_epoch): # Pass observations through networs and get action + predicted value if len(policies) == 1: #this project's case a, info = policies[0].sess_run(o, sess_act=sess) v_t = info['vpred'] logp_t = info['ac_logp'] else: o = splitobs(o, keepdims=False) ob_policy_idx = np.split(np.arange(len(o)), len(policies)) actions = [] for i, policy in enumerate(policies): inp = itemgetter(*ob_policy_idx[i])(o) inp = listdict2dictnp([inp] if ob_policy_idx[i].shape[0] == 1 else inp) ac, info = policy.act(inp) actions.append(ac) action = listdict2dictnp(actions, keepdims=True) # Take a step in the environment o2, r, d, env_info = env.step(a) ep_ret += r ep_len += 1 # If env.render is uncommented, the experience loop is displayed (visualized) # in real time (much slower, but excelent debugging) # env.render() # save experience in buffer and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 # Treat the end of a trajectory terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1) or env_info.get( 'discard_episode', False): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target if d: last_val = 0 else: _, info = policies[0].sess_run(o, sess_act=sess) last_val = info['vpred'] #Compute advantage estimates and rewards-to-go buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 for policy in policies: policy.reset() # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): print("Saved epoch: ", epoch) logger.save_state({'env': env}, None) # Perform PPO update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
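# Note: the PPOBuffer used above computes GAE-lambda advantages and the value
# targets inside finish_path(), but that class is not shown in this file. The
# helper below is only a minimal, self-contained numpy sketch of that usual
# computation (hypothetical function name, not part of the code above).
import numpy as np

def gae_lambda_sketch(rewards, values, last_val, gamma=0.99, lam=0.95):
    """rewards, values: arrays of shape (T,); last_val: bootstrap value V(s_T)
    (0 if the episode ended in a terminal state). Returns (advantages, returns)."""
    T = len(rewards)
    values_plus = np.append(values, last_val)
    deltas = rewards + gamma * values_plus[1:] - values_plus[:-1]  # TD residuals
    adv = np.zeros(T)
    ret = np.zeros(T)
    running_adv, running_ret = 0.0, last_val
    for t in reversed(range(T)):
        running_adv = deltas[t] + gamma * lam * running_adv   # GAE-lambda advantage
        running_ret = rewards[t] + gamma * running_ret        # rewards-to-go (value target)
        adv[t], ret[t] = running_adv, running_ret
    return adv, ret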
def impala(gym_or_pyco, env_fn, ac_kwargs=dict(), n=4, logger_kwargs=dict(), actor_critic=core.mlp_actor_critic, num_cpu=1, epochs=200, max_ep_len=300, steps_per_epoch=4000, gamma=0.99, seed=73,vf_lr=1e-3, pi_lr = 3e-4, entropy_cost = 0.00025, baseline_cost = .5, rho_bar = 1, c_bar = 1, train_pi_iters=80,train_v_iters=80, export_dir="/home/clement/Documents/spinningup_instadeep/data/cmd_impala/cmd_impala_s0/simple_save", tensorboard_path = '/home/clement/spinningup/tensorboard'): dict_continous_gym = ['CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink'] dict_discrete_gym = [] dict_gym = ['CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink'] env = env_fn() proc_id() seed += 10000 * 3 tf.set_random_seed(seed) np.random.seed(seed) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) if gym_or_pyco == 'gym': None else: env = env() obs_dim = env.observation_space.shape if env.action_space == 4: act_dim = env.action_space try: act_dim = env.action_space.n except: act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph if gym_or_pyco == 'pyco': x_ph = tf.placeholder(tf.float32, shape=(None, obs_dim[0], obs_dim[1], 1)) else: x_ph = tf.placeholder(tf.float32, shape=(1, obs_dim[0], obs_dim[1], obs_dim[2])) # a_ph = core.placeholders_from_spaces(env.action_space) if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): a_ph = tf.placeholder(tf.uint8, shape=(1)) elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box): a_ph = tf.placeholder(tf.float32, shape=(env.action_space.shape[0])) else: a_ph = tf.placeholder(tf.int32, shape=(None)) if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy', action_space=env.action_space.n) elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box): pi, logp, logp_pi, v = 
actor_critic(x_ph, a_ph, policy='baseline_gaussian_policy', action_space=env.action_space.shape[0]) else: pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy', action_space=env.action_space.n) adv_ph, pi_act_ph, logp_old_ph, v_trace_ph = core.placeholders(None, None, None, None) advantages = tf.stop_gradient(adv_ph) all_phs = [x_ph, a_ph, adv_ph, pi_act_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] logits_op = [logits] # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # need to get rho_param from the v_trace function c_param = tf.minimum(tf.exp(logp - logp_old_ph), c_bar) rho_param = tf.minimum(tf.exp(logp - logp_old_ph), rho_bar) def compute_baseline_loss(v_trace_ph, v): # Loss for the baseline, summed over the time dimension. # Multiply by 0.5 to match the standard update rule: # d(loss) / d(baseline) = advantage return .5 * tf.reduce_sum(tf.square(v_trace_ph - v)) def compute_entropy_loss(logits): policy = tf.nn.softmax(logits) log_policy = tf.nn.log_softmax(logits) entropy_per_timestep = tf.reduce_sum(-policy * log_policy, axis=-1) return -tf.reduce_sum(entropy_per_timestep) #advantages = adv_buf[i] def compute_policy_gradient_loss(logits, advantages, a=all_phs[1]): #actions = tf.one_hot(a,depth=act_dim) cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=a, logits=logits) advantages = tf.stop_gradient(advantages) policy_gradient_loss_per_timestep = cross_entropy * advantages return tf.reduce_sum(policy_gradient_loss_per_timestep) total_loss = compute_entropy_loss(logits) * entropy_cost + compute_baseline_loss(v_trace_ph, v) * baseline_cost + \ compute_policy_gradient_loss(logits, adv_ph, all_phs[1]) #pi_loss = tf.reduce_mean(adv_ph*rho_param) #v_loss = tf.reduce_mean((v_trace_ph - v) ** 2) def v_trace(obs_list, rews_list, act_list, logp_list, gamma, c_param, rho_param, v, obs_dim1, obs_dim2, last_obs_buf, sess): """Takes the trajectories and their associated rewards as input and returns, for each state x_s, a scalar target v_{x_s}. The trajectories are given as a list of trajectories. Args: obs_list: a list of observation lists, one per path, used for v_trace. rews_list: a list of reward lists, one per path, used for v_trace. act_list: a list of action lists, one per path, used for v_trace. logp_list: a vector of log probabilities log(p_old(a|s)) used for v_trace. gamma : hyperparam in v_trace and GAE c_param : a placeholder to be fed. rho_param : a placeholder to be fed v : a tf function for the value. Depends on x_ph and a_ph obs_dim1 : size of rows for board obs_dim2 : size of cols for board sess: session holding the learner's up-to-date policy at the time v_trace is computed.
""" size_obs = len(obs_list) v_tr = np.zeros(size_obs+1) c_param = sess.run([c_param],feed_dict={x_ph: obs_list, a_ph: act_list, logp_old_ph: logp_list})[0] c_param[-1] = 1 rho_param = sess.run([rho_param], feed_dict={x_ph: obs_list, a_ph: act_list, logp_old_ph: logp_list}) #v_tr[-1] = sess.run([v],feed_dict={x_ph: np.reshape(obs_list[-1], (1, obs_dim1, obs_dim2, 1))}) + rews_list[-1] * rho_param[0][-1] v_tr[-1] = last_val_buf last_obs = np.reshape(obs_list[-1], (1, obs_dim1, obs_dim2, 1)) v_tr[-2] = sess.run([v],feed_dict={x_ph: last_obs})[0]+rho_param[0][-1]*(rews_list[-1] + gamma * sess.run([v],feed_dict={x_ph: last_obs_buf})[0]- sess.run([v],feed_dict={x_ph: last_obs})[0]) + gamma * c_param[-1] *(v_tr[-1] - sess.run([v],feed_dict={x_ph: last_obs_buf})[0] ) for i in range(size_obs-1): obs_t_1 = np.reshape(obs_list[size_obs-2-i], (1, obs_dim1, obs_dim2, 1)) obs_t = np.reshape(obs_list[size_obs-i-1],(1,obs_dim1, obs_dim2, 1)) v_tr[size_obs-2-i] = sess.run([v],feed_dict={x_ph: obs_t_1})[0]+rho_param[0][size_obs-2-i]*(rews_list[size_obs-2-i] + gamma * sess.run([v], feed_dict={x_ph: obs_t})[0]- sess.run([v],feed_dict={x_ph: obs_t_1})[0]) + gamma * c_param[size_obs-2-i] *(v_tr[size_obs-i-1] - sess.run([v], feed_dict={x_ph: obs_t})[0] ) return v_tr # with adv_ph the advantage with v_trace. On the whole thing?.. with tf.name_scope('pi_loss'): #core.variable_summaries(pi_loss) core.variable_summaries(total_loss) # Optimizers #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) #train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(total_loss) total_env_frames=1e6 momentum=0. epsilon=.1 decay=.99 #tf.get_variable( # 'num_environment_frames', # initializer=tf.zeros_initializer(), # shape=[], # dtype=tf.float32, # trainable=False, # collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) #num_env_frames = tf.train.get_global_step() #learning_rate = tf.train.polynomial_decay(pi_lr, num_env_frames, # total_env_frames, 0) global_step = 100 starter_learning_rate = 3e-4 end_learning_rate=3e-5 decay_steps=5e2 learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, global_step, decay_steps, 0) optimizer = tf.train.RMSPropOptimizer(learning_rate, decay, momentum, epsilon) train_pi = optimizer.minimize(total_loss) sess = tf.Session() merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter(tensorboard_path + '/train', sess.graph) test_writer = tf.summary.FileWriter(tensorboard_path + '/test') sess.run(tf.global_variables_initializer()) sess.run(sync_all_params()) # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi':pi, 'v': v}) def update(adv_buf, obs_list, act_list, logp_list): #pi_l_old, v_l_old = sess.run([pi_loss, v_loss],feed_dict={x_ph # # : obs_list[0], a_ph: act_list[0], logp_old_ph: logp_list[0], v_trace_ph: v_trace_list[0][:-1], adv_ph: adv_buf[0]}) for i in range(n): for _ in range(train_pi_iters): sess.run(train_pi, feed_dict ={x_ph: obs_list[i], a_ph: act_list[i], logp_old_ph: logp_list[i], adv_ph: adv_buf[i], v_trace_ph: v_trace_list[i][:-1]}) #for _ in range(train_v_iters): # sess.run(train_v,feed_dict={x_ph: obs_list[i], a_ph: act_list[i], v_trace_ph: v_trace_list[i][:-1]}) #pi_l_new, v_l_new = sess.run([pi_loss, v_loss],feed_dict={x_ph: obs_list[0], a_ph: act_list[0], logp_old_ph: logp_list[0], v_trace_ph: v_trace_list[0][:-1], adv_ph: adv_buf[0]}) #logger.store(LossPi=pi_l_old, LossV=v_l_old, DeltaLossPi=(pi_l_new-pi_l_old), DeltaLossV=(v_l_new - 
v_l_old)) saver = tf.train.Saver() save_path = saver.save(sess,export_dir) for epoch in range(epochs): # Begins collecting trajectories and computing v_traces, adv. obs_list = [] rew_list = [] act_list = [] val_list = [] logp_list = [] v_trace_list = [] adv_buf = [] actors = [Actor(x_ph, a_ph, np.random.random_integers(0, high=39239, size=1)[0]) for i in range(n)] ep_len = [] last_rew_list = [] for i in range(n): actors[i].load_last_weights(export_dir) obs_buf, act_buf, rew_buf, val_buf, logp_buf, last_rew_buf, last_val_buf, last_obs_buf = actors[i].get_episode(env,get_action_ops,gym_or_pyco,obs_dim) obs_buf = np.reshape(obs_buf, (np.shape(obs_buf)[0], obs_dim[0], obs_dim[1], 1)) ep_len.append(len(obs_buf)) last_rew_list = np.append(last_rew_list, last_rew_buf) logp_buf = np.reshape(logp_buf, (np.shape(logp_buf)[0])) obs_list.append(obs_buf) rew_list.append(rew_buf) act_list.append(act_buf) val_list.append(val_buf) logp_list.append(logp_buf) v_trace_list.append(v_trace(obs_list[i], rew_list[i], act_list[i], logp_list[i], gamma, c_param, rho_param, v, obs_dim[0], obs_dim[1], last_obs_buf, sess)) rews = np.append(rew_list[i], last_rew_buf) vals = np.append(val_list[i], last_val_buf) adv = rews[:-1] + gamma * v_trace_list[i][1:] - vals[:-1] # normalization of adv: adv_mean, adv_std = mpi_statistics_scalar(adv) adv = (adv - adv_mean) / (adv_std + 1e-5) adv_buf.append(adv) update(adv_buf, obs_list, act_list, logp_list) saver = tf.train.Saver() save_path = saver.save(sess, export_dir) EpRet = [] for k in range(n): EpRet = np.append(EpRet,sum(rew_list[k])) EpRet[-1] = EpRet[-1]+last_rew_list[k] logger.store(EpRet=EpRet) logger.store(EpLen=ep_len) logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', with_min_and_max=True) logger.dump_tabular()
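# The v_trace() routine above evaluates V(s_t) with repeated sess.run calls and
# mixes the bootstrap handling into the recursion. For reference, this is a
# minimal numpy sketch of the v-trace recursion from the IMPALA paper, assuming
# the values and truncated importance weights have already been computed
# (hypothetical helper, not used by the code above):
#   v_s = V(x_s) + delta_s + gamma * c_s * (v_{s+1} - V(x_{s+1}))
import numpy as np

def v_trace_sketch(rewards, values, bootstrap_value, rhos, cs, gamma=0.99):
    """rewards, values, rhos, cs: arrays of shape (T,); bootstrap_value: V(x_T)."""
    T = len(rewards)
    values_plus = np.append(values, bootstrap_value)
    deltas = rhos * (rewards + gamma * values_plus[1:] - values_plus[:-1])
    vs = np.zeros(T + 1)
    vs[-1] = bootstrap_value
    for t in reversed(range(T)):
        vs[t] = values_plus[t] + deltas[t] + gamma * cs[t] * (vs[t + 1] - values_plus[t + 1])
    return vs  # v-trace targets, length T + 1 (last entry is the bootstrap value)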
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states.策略给出的动作 ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``.策略给出的x状态下的a动作概率 ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``.动作被采样的概率 ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!)状态x下的V值 =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. AC框架参数 seed (int): Seed for random number generators.#随机种子数0 steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch.每轮迭代次数4000 epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform.轮次50 gamma (float): Discount factor. (Always between 0 and 1.)折扣因子0.99 clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. 剪切比率0.2 pi_lr (float): Learning rate for policy optimizer.策略学习率3e-4 vf_lr (float): Learning rate for value function optimizer.评价网络学习率1e-3 train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.)策略最大梯度下降步数80 train_v_iters (int): Number of gradient descent steps to take on value function per epoch.评价最大梯度下降步数80 lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) TD(lambda)中的lambda =0.97 max_ep_len (int): Maximum length of trajectory / episode / rollout. 每次最长步长 1000 target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. 
(Usually small, 0.01 or 0.05.) Used for early stopping; target KL 0.01. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. Here: every 10 epochs. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() # create the environment obs_dim = env.observation_space.shape # observation dimensions act_dim = env.action_space.shape # action dimensions # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # takes observations and actions, returns the policy-related tensors # Need all placeholders in *this* order later (to zip with data from buffer): observations, actions, advantages, returns and the old policy log-probs all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # experience buffer # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # log the number of policy and value-function parameters # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s), i.e. pi_new / pi_old min_adv = tf.where( adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # tf.where: if adv_ph > 0 (positive advantage) pick (1 + clip_ratio) * adv_ph, # otherwise (negative advantage) pick (1 - clip_ratio) * adv_ph pi_loss = -tf.reduce_mean(tf.minimum( ratio * adv_ph, min_adv)) # policy loss = minimum of ratio * adv and min_adv, limiting the policy shift v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp ) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp ) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or( ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) # logical OR: whether the ratio was clipped, clipped = True/False clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # cast clipped to float to get the clipped fraction # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get()) } # zip([x_ph, a_ph, adv_ph, ret_ph, logp_old_ph], #[self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf] ) pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # feed the data above, compute both losses and the entropy # Training for i in range(train_pi_iters): # policy iterations _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) # compute KL kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.'
% i) break # early-stop policy training logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # train the value network # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) # recompute the losses, KL and clip fraction logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) # log the old losses, KL, clip fraction and the delta losses start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # reset # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): # each epoch for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # get an action for the current observation o2, r, d, _ = env.step(a[0]) # step the environment and collect the reward ep_ret += r # accumulate the reward ep_len += 1 # increment the episode length # save and log buf.store(o, a, r, v_t, logp_t) # store (obs, action, reward, value, logp) in the experience buffer logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) # done, or reached the maximum episode length if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # trajectory was cut off by the epoch after ep_len steps # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
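# The pi_loss above uses the tf.where form of the PPO-clip objective. Below is
# a small numpy sketch of the same objective next to the more common
# tf.clip_by_value-style form, which gives the identical value; all names here
# are illustrative and this is not part of the training code above.
import numpy as np

def ppo_clip_loss_sketch(logp, logp_old, adv, clip_ratio=0.2):
    ratio = np.exp(logp - logp_old)                    # pi(a|s) / pi_old(a|s)
    # tf.where form used above:
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    loss_where = -np.mean(np.minimum(ratio * adv, min_adv))
    # clip form, equivalent for every sign of the advantage:
    loss_clip = -np.mean(np.minimum(ratio * adv,
                                    np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv))
    assert np.allclose(loss_where, loss_clip)
    return loss_where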
def __init__(self, env_fn, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, save_freq=10, exp_name='', test_agent=False, load=False): # seed self.seed = seed self.seed += 10000 * proc_id() tf.set_random_seed(self.seed) np.random.seed(self.seed) random.seed(self.seed) # Hyper-parameter self.env = env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape self.gamma, self.lam = gamma, lam self.steps_per_epoch, self.epochs = steps_per_epoch, epochs self.pi_lr, self.vf_lr = pi_lr, vf_lr self.clip_ratio, self.target_kl = clip_ratio, target_kl self.train_pi_iters, self.train_v_iters = train_pi_iters, train_v_iters self.exp_name, self.save_freq, self.max_ep_len = exp_name, save_freq, max_ep_len if test_agent: from visdom import Visdom self.viz = Visdom() assert self.viz.check_connection() self.win = self.viz.matplot(plt) # Experience buffer self.buf = PPOBuffer(self.obs_dim, self.act_dim, self.steps_per_epoch, self.gamma, self.lam) self.graph = tf.Graph() with self.graph.as_default(): self.sess = tf.Session(config=config, graph=self.graph) self.__make_model() self.sess.run(tf.global_variables_initializer()) # Sync params across processes self.sess.run(sync_all_params()) Count_Variables() print('Trainable_variables:') for v in tf.compat.v1.trainable_variables(): print('{}\t {}'.format(v.name, str(v.shape))) var_list = tf.global_variables() self.saver = tf.compat.v1.train.Saver(var_list=var_list, max_to_keep=1) # summary self.writer = tf.compat.v1.summary.FileWriter("logs/" + exp_name) if load: self.load() self.sess.run(sync_all_params()) self.ep_ret_ph = tf.placeholder(tf.float32, shape=(), name="ep_ret_ph") self.ep_Entropy_ph = tf.placeholder(tf.float32, shape=(), name="Entropy") self.clipfrac_ph = tf.placeholder(tf.float32, shape=(), name="clipfrac") self.ep_len_ph = tf.placeholder(tf.float32, shape=(), name="ep_len_ph") self.test_summary = tf.compat.v1.summary.merge([ tf.compat.v1.summary.scalar('EP_ret', self.ep_ret_ph, family='test'), tf.compat.v1.summary.scalar('EP_len', self.ep_len_ph, family='test') ]) self.entropy_summary = tf.compat.v1.summary.merge([ tf.compat.v1.summary.scalar('Entropy', self.ep_Entropy_ph, family='test'), tf.compat.v1.summary.scalar('clipfrac', self.clipfrac_ph, family='test') ])
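# The `config` object passed to tf.Session in the constructor above is assumed
# to be defined elsewhere in that module. Elsewhere in this collection (see the
# vpg() function below) the session config is built with GPU memory growth
# enabled; a minimal sketch of such a ConfigProto, not necessarily the exact
# settings used by the class above:
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True   # do not reserve the whole GPU up front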
def ppo(env_fn, # by default, use the neural network mlp we define in core actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ "Args: env_fn: A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function with takes in placeholder symbols for state, ``x_ph``, and action ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given states. ``logp`` (batch,) | Gives log probability according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ======================================" -OpenAI Okay, quick interruption to OpenAI documentation here. actor_critic is the function which interfaces with tensorflow. It takes in ``x_ph`` (x placeholder), ie. a representation of the current state, and ``a_ph``, a representation of the some actions. (TODO: document *what* these actions are). actor_critic runs these inputs through the tensorflow graph and returns several pieces of information that are relevant to PPO; these are described above. Back to OpenAI: " ac_kwargs (dict): Any kwargs appropriate for actor_critic function you provided to PPO. seed (int): Seed for random number generators. setps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value funciton per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1). max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function." 
- OpenAI """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # modify the seed based on the process so if # we run this in multiple processes # simultaneously we don't do the # exact same thing seed += 10000 * proc_id() # set up our random stuff with this seed tf.set_random_seed(seed) np.random.seed(seed) # create the environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # tell the policy (implemented in actor_critic function) what the action space is ac_kwargs['action_space'] = env.action_space # "Inputs to computation graph" -OpenAI # create tensorflow placeholders for observations (x_ph), actions (a_ph), # advantages (adv_ph), returns (ret_ph), log probabilities # in the current state of the policy (logp_old_ph) # (old since this is used compared to the newer version of the policy # we are creating in the optimization step, comparing to this "old" version) x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # "Main outputs from computation graph" -OpenAI # essentially here we fill in the tensorflow graph so we can compute # the pi, logp, logp_pi, and v tensors based on the # x_ph and a_ph we created above pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # "Need all placeholders in *this* order later (to zip with data from buffer)" -OpenAI all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # "Every step, get: action, value, and logprob" -OpenAI # we later feed this list into tf.session.run() # to tell it to compute the value of pi, v, logp_pi # using the tensorflow graph we have created get_action_ops = [pi, v, logp_pi] # Experience buffer # number of steps per epoch per process local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count the number of parameters we are gonna be training, # both for the policy and for the value function var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # PPO objectives # ratio is the ratio of two probabilities: # pi(a|s) / pi_old(a|s) # where pi(a|s) is the probability of performing action a # given state s GIVEN THE POLICY WHOSE PARAMETERS WE ARE CHANGING # DURING THE OPTIMIZATION STEP # and pi_old(a|s) is the probability of the policy, # with fixed mlp parameters after the last update, # performing a given state s # we essentially use math to find the gradient of pi(a|s) with respect # to the parameters of the mlp, and this is the core of how we calculate # the gradient of the objective function for gradient descent ratio = tf.exp(logp - logp_old_ph) # "pi(a|s) / pi_old(a|s)"-OpenAI # this min_adv, along with the tf.minimum call in the next line of code, # implement the PPO-clip functionality # NOTE: calling this `min_adv` is a bit confusing; if advantage is negative # this is the min value we allow the gradient descent to consider as the advantage; # but it is the MAX value if advantage is positive. min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # create the functions whose gradients we wish to use for gradient descent # during optimization # for our policy optimization, it is the PPO objective; # for the value function it is simply an error-squared # note that reduce_mean just calculates the mean of the values in the tensor; # ie. 
this gives the expected value of the loss given the experimental values we have pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # "a sample estimate for KL-divergence, easy to compute" -OpenAI approx_ent = tf.reduce_mean(-logp) # "a sample estimate for entropy, also easy to compute" -OpenAI clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # what fraction of advantages are clipped # Optimizers # These use gradient descent with the gradient of the objective # functions we defined above to improve parameters for pi and v train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # initialize the tensorflow computation graph's parameters # with values sess = tf.Session() sess.run(tf.global_variables_initializer()) # "Sync params across processes" -OpenAI sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): # create a dictionary of values, which specify to tensorflow what # to input for the placeholders: tensors containing the data from # the trajectory we have stored in buf inputs = {k:v for k, v in zip(all_phs, buf.get())} # calculate these for logging later pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): # run a training step for the policy, and estimate the kl-divergence # (ie. how much the policy changed) on this step _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) # if the kl divergence is too high, stop training on this step # TODO: understand better why it is important to do this if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.'%i) break logger.store(StopIter=i) # train our value function mlp for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # "Log changes from update" -OpenAI # TODO: This could be made a bit more computationally efficient by not recalculating pi_l_old each loop # after having calculated the same thing as pi_l_new the previous run through the loop! # Plus, does it really make the most sense to output pi_l_old and v_l_old as LossPi and LossV # instead of pi_l_new and v_l_new? pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() # initialize the variables we use while training # o = observation (env.reset() returns initial observation) # r = reward = (starts as 0) # d = done? 
(whether current episode in env is over) # ep_ret = episode return # ep_len = length of episode so far o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # "Main loop: collect experience in env and update/log each epoch" for epoch in range(epochs): for t in range(local_steps_per_epoch): # run the computation of the action, value function, and probability of the action # using the most recent observation in the x_ph slot a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # take the action we computed and advance the environment o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch - 1): if not terminal: print('Warning: trajectory cut off by epoch at %d steps'%ep_len) # "if trajectory didn't reach terminal state, bootstrap value target" -OpenAI # in other words, if the we are stopping this trajectory due to a termination # signal from the env, last_val = the reward from the last step, r # otherwise we stopped because we reached the max episode length or max local_steps_per_epoch, # in which ase we set last_val = estimate of the value of current state based on v function # we are training last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) # "only store EpRet / EpLen if trajectory finished" -OpenAI if terminal: logger.store(EpRet=ep_ret, EpLen=ep_len) # reset our training variables and the training environment o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # every save_freq epochs, # save the state of the environment # also save the current state of our value function model # and policy # these are automatically saved by the save_state function # since we have already called logger.setup_tf_saver if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # "Perform PPO update!" update() # "Log info about epoch" logger.log_tabular('Epoch', epoch) try: logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) except: pass logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
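# Regarding the TODO above about KL-based early stopping: the approximate KL
# used throughout this file is just the sample mean of (logp_old - logp), and
# training on the current batch stops once the new policy has drifted too far
# from the policy that collected the data, since the importance ratios (and the
# clipped objective built on them) then become unreliable. A tiny numpy sketch
# of that check with illustrative names:
import numpy as np

def should_stop_sketch(logp_old, logp, target_kl=0.01):
    approx_kl = np.mean(logp_old - logp)   # sample estimate of the KL divergence
    return approx_kl > 1.5 * target_kl     # same 1.5x margin as used above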
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, custom_h=None, do_checkpoint_eval=False, env_name=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # create logger for tensorboard tb_logdir = "{}/tb_logs/".format(logger.output_dir) tb_logger = Logger(log_dir=tb_logdir) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space if custom_h is not None: hidden_layers_str_list = custom_h.split('-') hidden_layers_int_list = [int(h) for h in hidden_layers_str_list] ac_kwargs['hidden_sizes'] = hidden_layers_int_list # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the # whole GPU memory config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) # log tf graph tf.summary.FileWriter(tb_logdir, sess.graph) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) # for saving the best models and performances during train and evaluate best_eval_AverageEpRet = 0.0 best_eval_StdEpRet = 1.0e20 def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == 
local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save. logger.save_state({'env': env}, epoch) # Evaluate and save best model if do_checkpoint_eval and epoch > 0: # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999. # Doing this way, I can use test_policy and plot directly to test the best models. # saved best models includes: # 1) a copy of the env # 2) the best rl model with parameters # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch # note that 1) and 2) are spinningup defaults, and 3) is a custom save best_eval_AverageEpRet, best_eval_StdEpRet = eval_and_save_best_model( best_eval_AverageEpRet, best_eval_StdEpRet, # a new logger is created and passed in so that the new logger can leverage the directory # structure without messing up the logger in the training loop eval_logger=EpochLogger( **dict(exp_name=logger_kwargs['exp_name'], output_dir=os.path.join(logger.output_dir, "simple_save999999"))), train_logger=logger, tb_logger=tb_logger, epoch=epoch, # the env_name is passed in so that to create an env when and where it is needed. This is to # logx.save_state() error where an env pointer cannot be pickled env_name=env_name, get_action=lambda x: sess.run( pi, feed_dict={x_ph: x[None, :]})[0]) # Perform VPG update! update() # # # Log into tensorboard log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False) tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch) tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
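# vpg() above accepts the hidden-layer sizes as a dash-separated string via
# custom_h, which it splits into ac_kwargs['hidden_sizes']. A quick standalone
# sketch of that parsing with an example value:
custom_h = "128-64"                                   # example value, e.g. from a CLI flag
hidden_sizes = [int(h) for h in custom_h.split('-')]  # -> [128, 64]
assert hidden_sizes == [128, 64]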
def __init__(self, env, num_agents, config): self.num_agent = num_agents self.config = config self.env = env self.observation_space = env.observation_space self.action_space = env.action_space self.is_rnn = config["network"] == "rnn" self.is_cnn = config["network"] == "cnn" config["num_agent"] = num_agents if isinstance(self.observation_space, gym.spaces.Box): if config["share_policy"]: pg_agent = PGAgent( "shared", self.observation_space, gym.spaces.Discrete(self.action_space.shape[0] / self.num_agent), config, ) self.agents = [pg_agent for i in range(num_agents)] else: self.agents = [ PGAgent( i, self.observation_space, gym.spaces.Discrete(self.action_space.shape[0] / self.num_agent), config, ) for i in range(num_agents) ] else: if config["share_policy"]: pg_agent = PGAgent( "shared", self.observation_space[0], self.action_space[0], config, ) self.agents = [pg_agent for i in range(num_agents)] else: self.agents = [ PGAgent(i, self.observation_space[i], self.action_space[i], config) for i in range(num_agents) ] self.local_steps_per_epoch = int(config["steps_per_epoch"] / num_procs()) if type(self.action_space[0]) is gym.spaces.Discrete: action_dim = 1 else: action_dim = self.action_space[0].shape[0] if type(self.observation_space[0]) is gym.spaces.Discrete: obs_dim = self.observation_space[0].n else: if self.is_cnn: obs_dim = self.observation_space[0].shape else: obs_dim = self.observation_space[0].shape[0] self.buf = new_buffer( num_agents, obs_dim, action_dim, size=self.local_steps_per_epoch, type=config["algo"], is_rnn=self.is_rnn, is_cnn=self.is_cnn, rnn_length=config["rnn_length"], ) # init logger self.logger = Logger(self.config) # init session and sync params self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) self.sess.run(sync_all_params()) tf.io.write_graph( graph_or_graph_def=self.sess.graph_def, logdir=os.path.join(self.config["output_dir"]), name="model", ) # save model self.savers = [] with tf.device('/cpu:0'): for i in range(num_agents): vars_list = tf.compat.v1.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope="Agent_{}".format(i)) self.savers.append(tf.train.Saver(vars_list, max_to_keep=100)) if self.config["save_path"] is None: self.save_path = "/tmp/agents.pickle" else: self.save_path = self.config["save_path"] self.save_path += 's/' if not os.path.exists(self.save_path): os.makedirs(self.save_path) # load model if self.config["load_model"]: self.restore(self.config["restore_path"], [0, 1])
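# The constructor above derives per-agent action and observation dimensions
# from gym spaces. A small self-contained example of the same space handling;
# note that gym.spaces.Discrete expects an integer, so the floor division
# shape[0] // num_agents is the safe form of the split used above (the spaces
# and sizes below are made up for illustration):
import gym

num_agents = 2
action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(6,))
per_agent_actions = gym.spaces.Discrete(action_space.shape[0] // num_agents)  # Discrete(3)

obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,))
obs_dim = obs_space.n if isinstance(obs_space, gym.spaces.Discrete) else obs_space.shape[0]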
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dims = env.action_space #[ choice.shape for choice in env.action_space.values() ] #act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholder(None), core.placeholder( None), {} for k in env.action_space: logp_old_ph[k] = core.placeholder(None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dims, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio, min_adv, pi_loss = {}, {}, {} for k in env.action_space: ratio[k] = tf.exp(logp[k] - logp_old_ph[k]) # pi(a|s) / pi_old(a|s) min_adv[k] = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss[k] = -tf.reduce_mean(tf.minimum(ratio[k] * adv_ph, min_adv[k])) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl, approx_ent, clipped, clipfrac = {}, {}, {}, {} for k in env.action_space: approx_kl[k] = tf.reduce_mean( logp_old_ph[k] - logp[k]) # a sample estimate for KL-divergence, easy to compute approx_ent[k] = tf.reduce_mean( -logp[k]) # a sample estimate for entropy, also easy to compute clipped[k] = tf.logical_or(ratio[k] > (1 + clip_ratio), ratio[k] < (1 - clip_ratio)) clipfrac[k] = tf.reduce_mean(tf.cast(clipped[k], tf.float32)) pi_loss_sum = tf.reduce_sum(list(pi_loss.values())) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss_sum) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving save_outputs = {'v': v} for k in env.action_space: save_outputs['pi_' + k] = pi[k] logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs=save_outputs) def update(): inputs = {} for k, v in zip(all_phs, buf.get()): if type(k) is not dict: inputs[k] = v else: for k_, v_ in zip(k.values(), v.values()): inputs[k_] = v_ pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) for k in kl: kl[k] = mpi_avg(kl[k]) if max(list(kl.values())) > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs) sum_dict = lambda x: x if type(x) is not dict else np.sum( list(x.values())) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=sum_dict(kl), Entropy=sum_dict(ent), ClipFrac=sum_dict(cf), DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o2, r, d, _ = env.step(**a) env.render() #force_realtime=True) ep_ret += r #print ("frame_return: %.4f sofar_EpRet: %.4f" % (r, ep_ret)) ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) print("EpRet:", ep_ret) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
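# ---------------------------------------------------------------------------
# A minimal NumPy sketch of the clipped surrogate assembled above: one scalar
# PPO-clip loss per action head, summed across heads the same way pi_loss_sum
# is. The names below are illustrative only (nothing in this file calls them),
# and the sketch assumes the module-level `np` import.
def _clipped_surrogate_np(logp_new, logp_old, adv, clip_ratio=0.2):
    """-E[min(ratio * adv, clipped adv)] for a single action head."""
    ratio = np.exp(logp_new - logp_old)              # pi(a|s) / pi_old(a|s)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return -np.mean(np.minimum(ratio * adv, min_adv))


def _summed_clipped_surrogate_np(heads, clip_ratio=0.2):
    """heads: dict of head name -> (logp_new, logp_old, adv) arrays.

    Example (illustrative data only):
        adv = np.random.randn(64)
        lpo = np.random.randn(64)
        heads = {'move': (lpo + 0.1, lpo, adv), 'turn': (lpo - 0.1, lpo, adv)}
        _summed_clipped_surrogate_np(heads)
    """
    return sum(_clipped_surrogate_np(lp, lpo, adv, clip_ratio)
               for lp, lpo, adv in heads.values())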
def main(env_fn, traj_dir, actor_critic=core.mlp_actor_critic, bc_itr=1000, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=4000, target_kl=0.01, save_freq=100, r_env_ratio=0, reward_type='negative', trj_num=30, buf_size=None, si_update_ratio=0.02, js_threshold_ratio=0.5, js_smooth=5): """ Test behavioral cloning: fit the policy to expert trajectories with BC, then roll it out for visual evaluation. """ seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size) #!add Discriminator object D_js_m = JS_div_machine(env, hidden_size=d_hidden_size) e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',') e_act = np.loadtxt(traj_dir + '/actions.csv', delimiter=',') # Demo trajectory Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=trj_num, max_size=buf_size, js_smooth_num=js_smooth) #!sibuf assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=bc_itr) start_time = time.time() o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 # Roll out the cloned policy for visual inspection for epoch in range(1000000): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o, r, d, _ = env.step(a[0]) env.render() time.sleep(1e-3) ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal: print('EpRet {}, EpLen {}'.format(ep_ret_task, ep_len)) o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset( ), 0, False, 0, 0, 0, 0
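# ---------------------------------------------------------------------------
# BC.learn() above is used as a black box; a common formulation (assumed here,
# not confirmed by this file) maximizes the policy's log-likelihood of the
# expert actions. The toy below fits only a constant Gaussian mean with fixed
# std to expert actions, purely to illustrate that objective. The name is
# illustrative and nothing in this file calls it.
def _bc_gaussian_mean_np(expert_act, std=0.5, lr=0.1, iters=200):
    """Gradient descent on the mean NLL of N(mu, std^2) over expert actions."""
    mu = np.zeros(expert_act.shape[1])
    for _ in range(iters):
        # d/dmu of 0.5*((a - mu)/std)^2, averaged over the batch, is (mu - a)/std^2
        grad = (mu - expert_act).mean(axis=0) / std ** 2
        mu -= lr * grad
    return mu  # converges toward expert_act.mean(axis=0)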
def sigail(env_fn, traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(), d_hidden_size=64, seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000, beta=1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100, r_env_ratio=0, d_itr=20, reward_type='negative', trj_num=20, buf_size=1000, si_update_ratio=0.02, js_smooth=5, buf_update_type='random', pretrain_bc_itr=0): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape D = Discriminator(env, hidden_size=d_hidden_size, reward_type=reward_type) #!add Discriminator object D_js_m = JS_div_machine(env, hidden_size=d_hidden_size) e_obs = np.zeros((buf_size, obs_dim[0])) e_act = np.zeros((buf_size, act_dim[0])) Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=trj_num, max_size=buf_size, js_smooth_num=js_smooth) #!sibuf trj_full = False assert e_obs.shape[1:] == obs_dim # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, pi_std, entropy, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum( ratio * adv_ph, min_adv)) - beta * entropy #add entropy v_loss = tf.reduce_mean((ret_ph - v)**2) #ret_phには累積報酬のバッファが入る # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph) sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Sync params across processes # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get()) } #all_phsは各バッファーに対応するプレースホルダー辞書 pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training#ここも変える必要あり? おそらく変えなくて良い for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: #更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): # update the value function sess.run(train_v, feed_dict=inputs) # Log changes from update (compute the new losses) pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs) logger.store( LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=std_ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), # improvement from the update DeltaLossV=(v_l_new - v_l_old), Std=std) start_time = time.time() o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0 if pretrain_bc_itr > 0: BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=pretrain_bc_itr) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ''' if t <150: env.render() time.sleep(0.03) ''' ep_ret_task += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): ''' if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) ''' #!add discriminator train '''# including the terminal step would also be an option o_reshape = o.reshape(core.combined_shape(1,obs_dim)) a_reshape = a.reshape(core.combined_shape(1,act_dim)) agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#! convert o from (obs_dim,) to (1, obs_dim) before appending agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)# also include the terminal state-action pair when training D ''' agent_obs = buf.obs_buf[buf.path_slice()] agent_act = buf.act_buf[buf.path_slice()] #D.train(sess,e_obs,e_act ,agent_obs,agent_act) #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()# store the reward produced by each state-action pair in the buffer (rewards are shifted by one step) if trj_full: gail_r = 1 else: gail_r = 0 rew_gail = gail_r * D.get_reward( sess, agent_obs, agent_act).ravel() # store the reward produced by each state-action pair in the buffer (rewards are shifted by one step) ep_ret_gail += rew_gail.sum() #!before gail_ratio ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail rew_gail_head = rew_gail[:-1] last_val_gail = rew_gail[-1] buf.rew_buf[slice( buf.path_start_idx + 1, buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[ slice(buf.path_start_idx + 1, buf.ptr)] #!add GAIL reward; the final reward is not included, so the slice is one element shorter if d: # if trajectory didn't reach terminal state, bootstrap value target last_val = r_env_ratio * r + last_val_gail else: last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1) }) # was v_last=... before, but this seems fine buf.finish_path( last_val) # make sure buf.finish_add_r_v has been called before this if terminal: # only store trajectory to SIBuffer if trajectory finished if trj_full: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory else: Sibuffer.store( agent_obs, agent_act, sum_reward=ep_ret_task) #!store trajectory logger.store(EpRet=ep_ret_task, EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail, EpLen=ep_len) o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset( ), 0, False, 0, 0, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, epoch) # Perform PPO update!
if not (trj_full): M_obs_buf = Sibuffer.get_obs_trj() trj_full = (M_obs_buf.shape[0] >= buf_size) if trj_full: # once the replay buffer has grown past r_threshold Sibuffer.update_main_buf(ratio_update=si_update_ratio, update_type=buf_update_type) M_obs_buf = Sibuffer.get_obs_trj() M_act_buf = Sibuffer.get_act_trj() d_batch_size = len(agent_obs) for _t in range(d_itr): e_obs_batch, e_act_batch = Sibuffer.get_random_batch( d_batch_size) D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act) D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs, e_act) # train so the distance between the buffer and the expert can be tracked js_d = D.get_js_div(sess, Sibuffer.main_obs_buf, Sibuffer.main_act_buf, agent_obs, agent_act) js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs, e_act) else: js_d, js_d_m = 0.5, 0.5 update() Sibuffer.store_js(js_d) logger.store(JS=js_d, JS_M=js_d_m, JS_Ratio=Sibuffer.js_ratio_with_random) # Log info about epoch #if epoch % 10 == 0: # print the logger only every 10 epochs logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpRet_Sum', average_only=True) logger.log_tabular('EpRet_Gail', average_only=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('Std', average_only=True) logger.log_tabular('buffer_r', Sibuffer.buffer_r_average) logger.log_tabular('JS', average_only=True) logger.log_tabular('JS_M', average_only=True) logger.log_tabular('JS_Ratio', average_only=True) logger.dump_tabular()
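# ---------------------------------------------------------------------------
# The slice assignment on buf.rew_buf in the epoch loop above blends the
# discriminator ("GAIL") rewards with scaled environment rewards, with a
# one-step offset because the reward for (s_t, a_t) lands at index t+1. The
# sketch below reproduces only that indexing on plain arrays; how
# D.get_reward() computes rew_gail is not shown in this file, so it is taken
# as given. Names are illustrative and nothing in this file calls them.
def _blend_gail_rewards_np(env_rew, rew_gail, r_env_ratio, start, ptr):
    """Return (new_rew, last_val_gail) for the path occupying [start, ptr).

    Assumes len(rew_gail) == ptr - start, i.e. one GAIL reward per step of the path.
    """
    rew = env_rew.copy()
    rew_gail_head, last_val_gail = rew_gail[:-1], rew_gail[-1]
    # the final GAIL reward is held back as a bootstrap term, so the written
    # slice is one element shorter than the path
    rew[start + 1:ptr] = rew_gail_head + r_env_ratio * rew[start + 1:ptr]
    return rew, last_val_gail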
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10, custom_h=None, eval_episodes=50, do_checkpoint_eval=False, env_name=None, eval_temp=1.0, train_starting_temp=1.0, env_version=None, env_input=None, target_arcs=None): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # create logger for tensorboard tb_logdir = "{}/tb_logs/".format(logger.output_dir) tb_logger = Logger(log_dir=tb_logdir) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space if custom_h is not None: hidden_layers_str_list = custom_h.split('-') hidden_layers_int_list = [int(h) for h in hidden_layers_str_list] ac_kwargs['hidden_sizes'] = hidden_layers_int_list # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) temperature_ph = tf.placeholder(tf.float32, shape=(), name="init") # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, temperature_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, temperature_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the # whole GPU memory config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) # log tf graph tf.summary.FileWriter(tb_logdir, sess.graph) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph, 'temperature': temperature_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log('Early stopping at step %d due to reaching max kl.' 
% i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, [] # initialize variables for keeping track of BEST eval performance best_eval_AverageEpRet = -0.05 # a negative value so that best model is saved at least once. best_eval_StdEpRet = 1.0e30 # save is used to only allow saving BEST models after half of training epochs save = True # below are used for early-stop. We early stop if # 1) a best model has been saved, and, # 2) 50 epochs have passed without a new save saved = False early_stop_count_started = False episode_count_after_saved = 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): current_temp = _get_current_temperature(epoch, epochs, train_starting_temp) for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1), temperature_ph: current_temp}) # save and log buf.store(o, a, r, v_t, logp_t, current_temp) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 if env_version >= 4 and env.action_is_dummy: # a is dummy action ep_dummy_action_count += 1 ep_dummy_steps_normalized.append(ep_len / env.allowed_steps) terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1), temperature_ph: current_temp}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) if env_version >= 4: logger.store(EpDummyCount=ep_dummy_action_count) logger.store(EpTotalArcs=env.adjacency_matrix.sum()) if len(ep_dummy_steps_normalized) > 0: ep_dummy_steps_normalized = np.asarray(ep_dummy_steps_normalized, dtype=np.float32).mean() logger.store(EpDummyStepsNormalized=ep_dummy_steps_normalized) o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, [] # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save. # logger.save_state({'env_name': env_name}, epoch) # # Save a new model every save_freq and at the last epoch. Only keep one copy - the current model # logger.save_state({'env_name': env_name}) # Evaluate and save best model if do_checkpoint_eval and epoch > 0: # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999. # Doing this way, I can use test_policy and plot directly to test the best models. 
# the saved best model includes: # 1) a copy of the env_name # 2) the best rl model with parameters # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch # note that 1) and 2) are spinningup defaults, and 3) is a custom save best_eval_AverageEpRet, best_eval_StdEpRet, saved = eval_and_save_best_model( best_eval_AverageEpRet, best_eval_StdEpRet, # a new logger is created and passed in so that it can leverage the directory # structure without messing up the logger in the training loop eval_logger=EpochLogger(**dict( exp_name=logger_kwargs['exp_name'], output_dir=os.path.join(logger.output_dir, "simple_save999999"))), train_logger=logger, tb_logger=tb_logger, epoch=epoch, # the env_name is passed in so that an env can be created when and where it is needed. This works around a # logx.save_state() error where an env pointer cannot be pickled env_name="F{}x{}T{}_SP{}_v{}".format(env.n_plant, env.n_product, env.target_arcs, env.n_sample, env_version) if env_version >= 3 else env_name, env_version=env_version, env_input=env_input, render=False, # change this to True if you want to visualize how arcs are added during evaluation target_arcs=env.target_arcs, get_action=lambda x: sess.run(pi, feed_dict={x_ph: x[None, :], temperature_ph: eval_temp})[0], # number of samples to draw when simulating demand n_sample=5000, num_episodes=eval_episodes, save=save, seed=seed ) # Perform PPO update! update() # Log into tensorboard log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True) log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="ClipFrac", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="StopIter", with_min_and_max=False) tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch) tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch) tb_logger.log_scalar(tag="epoch_temp", value=current_temp, step=epoch) if env_version >= 4: log_key_to_tb(tb_logger, logger, epoch, key="EpDummyCount", with_min_and_max=False) log_key_to_tb(tb_logger, logger, epoch, key="EpTotalArcs", with_min_and_max=False) if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0: log_key_to_tb(tb_logger, logger, epoch, key="EpDummyStepsNormalized", with_min_and_max=False) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True)
logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('EpochTemp', current_temp) if env_version >= 4: logger.log_tabular('EpDummyCount', with_min_and_max=True) if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0: logger.log_tabular('EpDummyStepsNormalized', average_only=True) logger.log_tabular('EpTotalArcs', average_only=True) logger.dump_tabular() # check for early stop if saved: # start counting the epochs elapsed since the last "saved" event early_stop_count_started = True # reset the count to 0 episode_count_after_saved = 0 else: # count this epoch only if early_stop_count_started == True if early_stop_count_started: episode_count_after_saved += 1 if episode_count_after_saved > 60: logger.log('Early Stopped at epoch {}.'.format(epoch), color='cyan') break
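# ---------------------------------------------------------------------------
# The early-stopping test in update() treats mean(logp_old - logp) as a sample
# estimate of KL(pi_old || pi_new), averaged over actions drawn from the old
# policy. The standalone toy below checks that estimate against the closed-form
# KL between two 1-D Gaussians; names are illustrative and nothing in this file
# calls them.
def _sample_kl_vs_exact_np(mu0=0.0, s0=1.0, mu1=0.3, s1=1.2, n=200000, seed=0):
    rng = np.random.RandomState(seed)
    x = rng.normal(mu0, s0, size=n)                  # actions sampled from the OLD policy
    logp_old = -0.5 * ((x - mu0) / s0) ** 2 - np.log(s0) - 0.5 * np.log(2 * np.pi)
    logp_new = -0.5 * ((x - mu1) / s1) ** 2 - np.log(s1) - 0.5 * np.log(2 * np.pi)
    approx_kl = np.mean(logp_old - logp_new)         # what the training code monitors
    exact_kl = np.log(s1 / s0) + (s0 ** 2 + (mu0 - mu1) ** 2) / (2 * s1 ** 2) - 0.5
    return approx_kl, exact_kl                       # typically agree to ~1e-3 at this sample size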
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3, train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10, algo='trpo'): """ Trust Region Policy Optimization (with support for Natural Policy Gradient) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: ============ ================ ======================================== Symbol Shape Description ============ ================ ======================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``info`` N/A | A dict of any intermediate quantities | (from calculating the policy or log | probabilities) which are needed for | analytically computing KL divergence. | (eg sufficient statistics of the | distributions) ``info_phs`` N/A | A dict of placeholders for old values | of the entries in ``info``. ``d_kl`` () | A symbol for computing the mean KL | divergence between the current policy | (``pi``) and the old policy (as | specified by the inputs to | ``info_phs``) over the batch of | states given in ``x_ph``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) ============ ================ ======================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TRPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) delta (float): KL-divergence limit for TRPO / NPG update. (Should be small for stability. Values like 0.01, 0.05.) vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. damping_coeff (float): Artifact for numerical stability, should be smallish. Adjusts Hessian-vector product calculation: .. math:: Hv \\rightarrow (\\alpha I + H)v where :math:`\\alpha` is the damping coefficient. Probably don't play with this hyperparameter. cg_iters (int): Number of iterations of conjugate gradient to perform. Increasing this will lead to a more accurate approximation to :math:`H^{-1} g`, and possibly slightly-improved performance, but at the cost of slowing things down. Also probably don't play with this hyperparameter. backtrack_iters (int): Maximum number of steps allowed in the backtracking line search. Since the line search usually doesn't backtrack, and usually only steps back once when it does, this hyperparameter doesn't often matter. backtrack_coeff (float): How far back to step during backtracking line search. (Always between 0 and 1, usually above 0.5.) lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. algo: Either 'trpo' or 'npg': this code supports both, since they are almost the same. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph, plus placeholders for old pdist (for KL) pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic( x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph ] + core.values_as_sorted_list(info_phs) # Every step, get: action, value, logprob, & info for pdist (for computing kl div) get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info) # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()} buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # TRPO losses ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) pi_loss = -tf.reduce_mean(ratio * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Optimizer for value function train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) # Symbols needed for CG solver pi_params = core.get_vars('pi') gradient = core.flat_grad(pi_loss, pi_params) v_ph, hvp = core.hessian_vector_product(d_kl, pi_params) if damping_coeff > 0: hvp += damping_coeff * v_ph # Symbols for getting and setting params get_pi_params = core.flat_concat(pi_params) set_pi_params = core.assign_params_from_flat(v_ph, pi_params) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def cg(Ax, b): """ Conjugate gradient algorithm (see https://en.wikipedia.org/wiki/Conjugate_gradient_method) """ x = np.zeros_like(b) r = b.copy( ) # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start. 
p = r.copy() r_dot_old = np.dot(r, r) for _ in range(cg_iters): z = Ax(p) alpha = r_dot_old / (np.dot(p, z) + EPS) x += alpha * p r -= alpha * z r_dot_new = np.dot(r, r) p = r + (r_dot_new / r_dot_old) * p r_dot_old = r_dot_new return x def update(): # Prepare hessian func, gradient eval inputs = {k: v for k, v in zip(all_phs, buf.get())} Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x})) g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs) g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old) # Core calculations for TRPO or NPG x = cg(Hx, g) alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS)) old_params = sess.run(get_pi_params) def set_and_eval(step): sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step}) return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs)) if algo == 'npg': # npg has no backtracking or hard kl constraint enforcement kl, pi_l_new = set_and_eval(step=1.) elif algo == 'trpo': # trpo augments npg with backtracking line search, hard kl for j in range(backtrack_iters): kl, pi_l_new = set_and_eval(step=backtrack_coeff**j) if kl <= delta and pi_l_new <= pi_l_old: logger.log( 'Accepting new params at step %d of line search.' % j) logger.store(BacktrackIters=j) break if j == backtrack_iters - 1: logger.log('Line search failed! Keeping old params.') logger.store(BacktrackIters=j) kl, pi_l_new = set_and_eval(step=0.) # Value function updates for _ in range(train_v_iters): sess.run(train_vf, feed_dict=inputs) v_l_new = sess.run(v_loss, feed_dict=inputs) # Log changes from update logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[ 1], agent_outs[2], agent_outs[3:] o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t, info_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform TRPO or NPG update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('KL', average_only=True) if algo == 'trpo': logger.log_tabular('BacktrackIters', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
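# ---------------------------------------------------------------------------
# The cg() helper above solves H x = g using only Hessian-vector products. The
# standalone check below runs the same iteration against an explicit symmetric
# positive-definite matrix and compares with np.linalg.solve; names are
# illustrative and nothing in this file calls them.
def _cg_check_np(n=8, iters=50, seed=0):
    rng = np.random.RandomState(seed)
    M = rng.randn(n, n)
    A = M @ M.T + n * np.eye(n)           # SPD and well conditioned
    b = rng.randn(n)
    Ax = lambda vec: A @ vec              # stands in for the hvp call

    x, r = np.zeros_like(b), b.copy()
    p, r_dot_old = r.copy(), np.dot(b, b)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + 1e-8)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return np.max(np.abs(x - np.linalg.solve(A, b)))   # ~0 once iters >= n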
def ppo_pyco(gym_or_pyco, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=473, steps_per_epoch=200, epochs=500, gamma=0.99, clip_ratio=0.1, pi_lr=1e-2, vf_lr=5e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=350, target_kl=0.01, logger_kwargs=dict(), save_freq=10, tensorboard_path='/home/clement/spinningup/tensorboard'): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
tensorboard_path: The path to the saved graphs&scalars in tensorboard """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) #seed += 10000 * proc_id() seed = 2808 tf.set_random_seed(seed) np.random.seed(seed) dict_continous_gym = [ 'CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink' ] dict_discrete_gym = [] env = env_fn() dict_gym = [ 'CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink' ] # This code is specific for pycolab if gym_or_pyco == 'gym': None else: env = env() obs_dim = env.observation_space.shape if env.action_space == 4: act_dim = env.action_space try: act_dim = env.action_space.n except: act_dim = env.action_space.shape #act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) if gym_or_pyco == 'pyco': x_ph = tf.placeholder(tf.float32, shape=(None, obs_dim[0], obs_dim[1], 1)) else: x_ph = tf.placeholder(tf.float32, shape=(None, obs_dim[0], obs_dim[1], obs_dim[2])) # a_ph = core.placeholders_from_spaces(env.action_space) if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): a_ph = tf.placeholder(tf.uint8, shape=(None)) elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box): a_ph = tf.placeholder(tf.float32, shape=(env.action_space.shape[0])) else: a_ph = tf.placeholder(tf.uint8, shape=(None)) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph # pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, **ac_kwargs) # actor_critic with relational policy # pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='relational_categorical_policy', action_space = env.action_space.n, **ac_kwargs) if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): pi, logp, logp_pi, v, logits = actor_critic( x_ph, a_ph, policy='baseline_categorical_policy', 
action_space=env.action_space.n) elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box): pi, logp, logp_pi, v = actor_critic( x_ph, a_ph, policy='baseline_gaussian_policy', action_space=env.action_space.shape[0]) else: pi, logp, logp_pi, v, logits = actor_critic( x_ph, a_ph, policy='baseline_categorical_policy', action_space=env.action_space.n) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(gym_or_pyco, obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # tensorboard test with tf.name_scope('pi_loss'): core.variable_summaries(pi_loss) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() # sess = tf_debug.LocalCLIDebugWrapperSession(sess) # tensorboard merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter(tensorboard_path + '/train', sess.graph) test_writer = tf.summary.FileWriter(tensorboard_path + '/test') sess.run(tf.global_variables_initializer()) # saver.restore(sess, "/home/clement/Documents/spinningup_instadeep/trained_params/model.ckpt") #saver.restore(sess, "/home/clement/Documents/spinningup_instadeep/data/cmd_ppo_pyco/cmd_ppo_pyco_s0/simple_save") #tf.reset_default_graph() #export_dir = "/home/clement/Documents/spinningup_instadeep/data/cmd_ppo_pyco/cmd_ppo_pyco_s0/simple_save" #tf.saved_model.loader.load(sess, ["serve"],export_dir) #sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving #logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(epoch): # inputs = {k:v for k,v in zip(all_phs, buf.get())} if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): pi_l_old, v_l_old, ent = sess.run( [pi_loss, v_loss, approx_ent], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) if gym_or_pyco == 'gym' and isinstance(env.action_space, Box): pi_l_old, v_l_old, ent = sess.run( [pi_loss, v_loss, approx_ent], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) else: pi_l_old, v_l_old, ent = sess.run( [pi_loss, v_loss, approx_ent], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) summary = tf.Summary( 
value=[tf.Summary.Value(tag="loss", simple_value=pi_l_old)]) test_writer.add_summary(summary, epoch) # Training for i in range(train_pi_iters): if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): _, kl = sess.run( [train_pi, approx_kl], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break if gym_or_pyco == 'gym' and isinstance(env.action_space, Box): _, kl = sess.run( [train_pi, approx_kl], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break else: _, kl = sess.run( [train_pi, approx_kl], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): sess.run(train_v, feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) if gym_or_pyco == 'gym' and isinstance(env.action_space, Box): sess.run(train_v, feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) else: #sess.run(train_v, feed_dict={logp_old_ph: buf.logp_buf, x_ph: o, a_ph: a, adv_ph: buf.adv_buf, # ret_ph: buf.ret_buf}) sess.run(train_v, feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) # Log changes from update if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete): pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) if gym_or_pyco == 'gym' and isinstance(env.action_space, Box): pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) else: pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict={ logp_old_ph: buf.logp_buf, x_ph: buf.obs_buf, a_ph: buf.act_buf, adv_ph: buf.adv_buf, ret_ph: buf.ret_buf }) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if gym_or_pyco == 'gym': o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2]) else: o = rgb_input_pyco(o, obs_dim) o = o.reshape(1, obs_dim[0], obs_dim[1], 1) # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): num_ep = 0 summary_ep = [] logits_debug = [] for t in range(local_steps_per_epoch): # a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.board.reshape(1, -1)}) a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o}) logits_debug.append(sess.run(logits, feed_dict={x_ph: o})[0]) # save and log buf.store(o, a, r, v_t, logp_t) # buf.store(o.board.reshape(1,-1), a, r, v_t, logp_t) # obs, act, rew, val, logp logger.store(VVals=v_t) o, r, d, _ = 
env.step(a[0]) if gym_or_pyco == 'pyco': o = rgb_input_pyco(o, obs_dim) o = o.reshape(1, obs_dim[0], obs_dim[1], 1) else: o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2]) if r is None: ep_ret += 0 r = 0 else: ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): num_ep += 1 last_val = r if d else sess.run(v, feed_dict={x_ph: o}) if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) else: last_val = 0 # if trajectory didn't reach terminal state, bootstrap value target buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) summary_ep = summary_ep + [ep_ret] # summary = tf.Summary(value=[tf.Summary.Value(tag="mean_ep_ret", simple_value=summary_ep)]) # test_writer.add_summary(summary, num_ep) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if gym_or_pyco == 'gym': o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2]) else: o = rgb_input_pyco(o, obs_dim) o = o.reshape(1, obs_dim[0], obs_dim[1], 1) # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! min_summary_ep = min(summary_ep) max_summary_ep = max(summary_ep) summary_ep = np.mean(summary_ep) value_summary = np.mean(buf.val_buf) summary = tf.Summary(value=[ tf.Summary.Value(tag="mean_ep_ret", simple_value=summary_ep) ]) test_writer.add_summary(summary, epoch) summary = tf.Summary(value=[ tf.Summary.Value(tag="min_ep_ret", simple_value=min_summary_ep) ]) test_writer.add_summary(summary, epoch) summary = tf.Summary(value=[ tf.Summary.Value(tag="max_ep_ret", simple_value=max_summary_ep) ]) test_writer.add_summary(summary, epoch) summary = tf.Summary(value=[ tf.Summary.Value(tag="mean_value", simple_value=value_summary) ]) test_writer.add_summary(summary, epoch) update(epoch) #saver = tf.train.Saver() #save_path = saver.save(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt") # If you want to reload saved variables : # with tf.Session() as sess: # Restore variables from disk. # saver.restore(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt") # since I changed my sess.run i have to reset the buffer myself: buf.get() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
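# ---------------------------------------------------------------------------
# buf.finish_path(last_val) is where advantage and return targets get computed.
# The PPOBuffer internals are not shown in this section, so the sketch below
# gives the standard GAE-lambda computation such a buffer is expected to
# perform on one finished path; names are illustrative and nothing in this
# file calls them.
def _discount_cumsum_np(x, discount):
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


def _gae_lambda_np(rews, vals, last_val, gamma=0.99, lam=0.97):
    """rews, vals: length-T arrays for one path; last_val bootstraps the tail."""
    r = np.append(np.asarray(rews, dtype=np.float64), last_val)
    v = np.append(np.asarray(vals, dtype=np.float64), last_val)
    deltas = r[:-1] + gamma * v[1:] - v[:-1]           # TD residuals
    adv = _discount_cumsum_np(deltas, gamma * lam)     # GAE-lambda advantages
    ret = _discount_cumsum_np(r, gamma)[:-1]           # rewards-to-go targets for V
    return adv, ret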
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-2, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform VPG update! 
update() # Log info about epoch #logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', average_only=True) #logger.log_tabular('EpLen', average_only=True) #logger.log_tabular('VVals', with_min_and_max=True) #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) #logger.log_tabular('LossPi', average_only=True) #logger.log_tabular('LossV', average_only=True) #logger.log_tabular('DeltaLossPi', average_only=True) #logger.log_tabular('DeltaLossV', average_only=True) #logger.log_tabular('Entropy', average_only=True) #logger.log_tabular('KL', average_only=True) #logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
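# --- Hedged sketch: driving vpg() as defined above ---
# A minimal example of how the function above could be invoked, assuming `gym` is
# installed and that `core.mlp_actor_critic` handles the chosen action space; the
# environment id 'CartPole-v0', the hyperparameters, and the logger paths are
# illustrative only. The function is defined but deliberately not called here.
import gym


def _run_vpg_cartpole_example():
    vpg(lambda: gym.make('CartPole-v0'),
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        steps_per_epoch=4000,
        epochs=10,
        logger_kwargs=dict(output_dir='./vpg_cartpole_data', exp_name='vpg_cartpole'))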
def ppo(env_fn, actor_critic=core_2.mlp_actor_critic, beta=1, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() # game environment obs_dim = env.observation_space.shape # get the observe dimension from environment act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #print(env.action_space) x_ph, a_ph = core_2.placeholders_from_spaces(env.observation_space, env.action_space) # 构建神经网络的时候,a_ph还没有 adv_ph, ret_ph, logp_old_ph, log_old_ph_all = core_2.placeholders(None, None, None, 18) #print(logp_old_ph) #print(log_old_ph_all) # Main outputs from computation graph pi, logp, logp_pi, v, logp_all = actor_critic(x_ph, a_ph, **ac_kwargs) # 目前这里的状态和action都还是放的placeholder # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, log_old_ph_all] # Every step, get: action, value, and logprob # 每一步都需要得到action(这里的pi似乎表示action) get_action_ops = [pi, v, logp_pi, logp_all] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core_2.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) #print((tf.exp(log_old_ph_all) * (logp - logp_old_ph))) kl = tf.reduce_mean(tf.multiply(tf.exp(log_old_ph_all),tf.transpose([logp - logp_old_ph]))) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) #pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # 两部分的loss pi_loss = -tf.reduce_mean(ratio * adv_ph - beta * kl) v_loss = tf.reduce_mean((ret_ph - v) ** 2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes # 同步参数 sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # 主循环 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, logp_all = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # save and log # 把数据放进 buffer pool 里 buf.store(o, a, r, v_t, logp_t, logp_all) logger.store(VVals=v_t) # o 应该代表observation o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! (one update after each collected rollout) #update() inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kld = sess.run([train_pi, kl], feed_dict=inputs) kld = mpi_avg(kld) if kld > 1.5 * target_kl: beta = 2 * beta if kld < target_kl / 1.5: beta = beta / 2 # logger.log('Early stopping at step %d due to reaching max kl.' % i) # break logger.store(StopIter=i) # the loop above trains the policy; this one trains the value function for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update (use a new name so the `kl` tensor defined above is not overwritten) pi_l_new, v_l_new, kl_new, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl_new, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
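# --- Hedged sketch: the adaptive KL-penalty coefficient used above ---
# The update loop above adjusts `beta` (the weight on the KL term in pi_loss) after
# measuring the mean KL between the old and new policies: double it when the policy
# moved too far, halve it when it barely moved. This is that rule in isolation; the
# function name `adapt_beta` is illustrative.


def adapt_beta(beta, measured_kl, target_kl):
    """Return an updated KL-penalty coefficient (PPO-penalty style heuristic)."""
    if measured_kl > 1.5 * target_kl:
        beta = 2.0 * beta        # policy moved too far: penalize KL more strongly
    elif measured_kl < target_kl / 1.5:
        beta = beta / 2.0        # policy barely moved: relax the penalty
    return beta

# Example: adapt_beta(beta=1.0, measured_kl=0.03, target_kl=0.01) returns 2.0.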
def sppo(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=200, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) ########### if args.alpha == 'auto': target_entropy = 0.35 log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=tf.log(0.2)) alpha = tf.exp(log_alpha) else: alpha = args.alpha ########### # Main outputs from computation graph mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi, h] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) ###### if args.alpha == 'auto': alpha_loss = tf.reduce_mean( -log_alpha * tf.stop_gradient(-h + target_entropy) ) # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 ) alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5) train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha]) ###### # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) # For PPO # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) # ### Scheme1: SPPO NO.2: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp) # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) # ### Scheme3: SPPO NO.3: add entropy # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp) # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv)) ### Scheme2: SPPO NO.2: add entropy min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean( tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h) v_loss = tf.reduce_mean((ret_ph - v)**2) #+(ret_ph - q)**2)/2.0 # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( h) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer( learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss) # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = 
sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): if args.alpha == 'auto': sess.run(train_alpha_op, feed_dict=inputs) _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) # for _ in range(train_v_iters): # sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old), Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t, h_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a}) # SPPO NO.1: add entropy # rh = r - args.alpha * logp_t if args.alpha == 'auto': rh = r + sess.run(alpha) * h_t else: rh = r + alpha * h_t # exact entropy # save and log buf.store(o, a, rh, v_t, logp_t) logger.store(VVals=v_t) o, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # d = False if ep_len == max_ep_len else d terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = r if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # # Save model # if (epoch % save_freq == 0) or (epoch == epochs-1): # logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Alpha', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
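# --- Hedged sketch: automatic entropy-temperature (alpha) adjustment ---
# When args.alpha == 'auto' above, a learnable log_alpha is trained so that the
# policy entropy stays near target_entropy: the loss pushes alpha up when entropy
# falls below the target and down when it rises above it. The snippet below isolates
# that piece under the assumption that an entropy-estimate tensor `h` is available;
# it uses a plain AdamOptimizer instead of MpiAdamOptimizer, and the variable name
# 'log_alpha_sketch' is chosen to avoid colliding with the variable created above.
import tensorflow as tf


def build_alpha_update(h, target_entropy=0.35, lr=1e-5):
    """Return (alpha, train_alpha_op) implementing the entropy-temperature rule."""
    log_alpha = tf.get_variable('log_alpha_sketch', dtype=tf.float32,
                                initializer=tf.log(0.2))
    alpha = tf.exp(log_alpha)
    # Gradients flow only through log_alpha; the entropy gap is treated as data.
    alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(-h + target_entropy))
    train_alpha_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(
        alpha_loss, var_list=[log_alpha])
    return alpha, train_alpha_op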
def __init__(self, args=None): if args is None: args = {} self.bot = None if "bot" in args: self.bot = args["bot"] self.epoch = 0 self.step = 0 self.actor_critic = core.mlp_actor_critic self.ac_kwargs = dict(hidden_sizes=[64] * 2) self.seed = 0 self.steps_per_epoch = 10000 self.epochs = 10 self.gamma = 0.99 self.clip_ratio = 0.2 self.pi_lr = 3e-4 self.vf_lr = 1e-3 self.train_pi_iters = 80 self.train_v_iters = 80 self.lam = 0.97 self.max_ep_len = 1000 self.target_kl = 0.01 self.logger_kwargs = {} self.save_freq = 1 map_name = "unknown" if self.bot is not None: map_name = self.bot.map_name self.logger_kwargs = { "output_dir": f".\\{map_name}\\ai_data", "exp_name": "builder_ai" } self.logger = EpochLogger(**self.logger_kwargs) #self.logger.save_config(locals()) self.logger.save_config(self.__dict__) seed = self.seed seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) #env = env_fn() self.env = BuilderEnv(args={"bot": self.bot}) obs_dim = self.env.observation_space.shape act_dim = self.env.action_space.shape # Share information about action space with policy architecture self.ac_kwargs['action_space'] = self.env.action_space print(str(self.env.observation_space)) print(str(self.env.action_space)) print(str(type(self.env.observation_space))) print(str(type(self.env.action_space))) # Inputs to computation graph self.x_ph, self.a_ph = core.placeholders_from_spaces( self.env.observation_space, self.env.action_space) self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders( None, None, None) # Main outputs from computation graph self.pi, self.logp, self.logp_pi, self.v = self.actor_critic( self.x_ph, self.a_ph, **self.ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) self.all_phs = [ self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph ] # Every step, get: action, value, and logprob self.get_action_ops = [self.pi, self.v, self.logp_pi] # Experience buffer self.local_steps_per_epoch = int(self.steps_per_epoch / num_procs()) self.buf = ppo.PPOBuffer( obs_dim, act_dim, self.local_steps_per_epoch, self.gamma, self.lam )
# Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives self.ratio = tf.exp(self.logp - self.logp_old_ph) # pi(a|s) / pi_old(a|s) self.min_adv = tf.where(self.adv_ph > 0, (1 + self.clip_ratio) * self.adv_ph, (1 - self.clip_ratio) * self.adv_ph) self.pi_loss = -tf.reduce_mean( tf.minimum(self.ratio * self.adv_ph, self.min_adv)) self.v_loss = tf.reduce_mean((self.ret_ph - self.v)**2) # Info (useful to watch during learning) self.approx_kl = tf.reduce_mean( self.logp_old_ph - self.logp) # a sample estimate for KL-divergence, easy to compute self.approx_ent = tf.reduce_mean( -self.logp) # a sample estimate for entropy, also easy to compute self.clipped = tf.logical_or(self.ratio > (1 + self.clip_ratio), self.ratio < (1 - self.clip_ratio)) self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32)) print(f"pi_lr:{self.pi_lr}, pi_loss:{self.pi_loss}") # Optimizers self.train_pi = MpiAdamOptimizer(learning_rate=self.pi_lr).minimize( self.pi_loss) self.train_v = MpiAdamOptimizer(learning_rate=self.vf_lr).minimize( self.v_loss) self.sess = tf.Session() self.sess.run(tf.global_variables_initializer()) # Sync params across processes self.sess.run(sync_all_params()) # Setup model saving self.logger.setup_tf_saver(self.sess, inputs={'x': self.x_ph}, outputs={ 'pi': self.pi, 'v': self.v }) self.start_time = time.time() self.o, self.r, self.d, self.ep_ret, self.ep_len = self.env.reset( args={}), 0, False, 0, 0 print(f"o:{self.o}, type:{type(self.o)}") self.epoch = 0 self.t = 0 self.load()
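# --- Hedged sketch: GAE-lambda advantages assumed inside the buffers ---
# Every variant above hands (gamma, lam) to a PPOBuffer/VPGBuffer and calls
# buf.finish_path(last_val) at trajectory boundaries. The commented-out numpy
# sketch below shows the standard GAE-lambda computation those buffers are
# assumed to perform; it is kept as a comment block (names `gae_advantages`,
# `rewards`, `values` are illustrative) so it cannot interfere with the
# surrounding class definition.
#
# import numpy as np
#
# def gae_advantages(rewards, values, last_val, gamma=0.99, lam=0.97):
#     """Return (advantages, returns) for one trajectory, bootstrapped with last_val."""
#     rewards = np.append(np.asarray(rewards, dtype=np.float32), last_val)
#     values = np.append(np.asarray(values, dtype=np.float32), last_val)
#     # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#     deltas = rewards[:-1] + gamma * values[1:] - values[:-1]
#     adv = np.zeros_like(deltas)
#     running = 0.0
#     for t in reversed(range(len(deltas))):
#         running = deltas[t] + gamma * lam * running
#         adv[t] = running
#     # Discounted rewards-to-go as value-function targets.
#     rets = np.zeros_like(deltas)
#     running = last_val
#     for t in reversed(range(len(deltas))):
#         running = rewards[t] + gamma * running
#         rets[t] = running
#     return adv, rets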