def ppo(env_fn,
        GUI=True,
        actor_critic=my_mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        on_policy=True,
        prev_epochs=0):
    """
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        GUI (bool): Whether or not to display the GUI during training.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    if GUI:
        env = env_fn("GUI", prev_epochs)
    else:
        env = env_fn("DIRECT", prev_epochs)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    sess = tf.Session()

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    # Main outputs from computation graph
    pi, logp, logp_pi, v, mu, log_std = actor_critic(x_ph, a_ph, **ac_kwargs)

    # if load_path==None:
    #     # Inputs to computation graph
    #     x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    #     # Main outputs from computation graph
    #     pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
    # else:
    #     fname = osp.join(load_path, 'tf1_save')
    #     print('\n\nLoading old model from %s.\n\n' % fname)
    #
    #     # load the things!
    #     model = restore_tf_graph(sess, fname)
    #     x_ph, a_ph = model['x'], model['a']
    #     pi, logp, logp_pi, v = model['pi'], model['logp'], model['logp_pi'], model['v']

    # Computed over one epoch; filled in by the buffer's methods
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)
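    # Note: min(ratio * adv, min_adv) is equivalent to the usual PPO-Clip term
    # min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv),
    # so pi_loss is the standard clipped surrogate objective.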

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'v': v,
                              'logp': logp,
                              'logp_pi': logp_pi
                          })

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)
        # lllogp, mmmu, llog_std = sess.run([logp, mu, log_std], feed_dict=inputs)

        # In principle logp should equal logp_old_ph on the first pass;
        # in practice they differ by numerical error on the order of 1e-6.

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        last_noise_time = 0.0
        noise = np.zeros(12)
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(
                                          1,
                                          -1)})  # CHANGE THE feed_dict HERE!
            # aa = a.copy()
            # if 2.0 < env.t < 4.0:
            #     # on_policy = False
            #     if env.t - last_noise_time > 0.1:
            #         noise = np.random.uniform(-0.5 * np.pi, 0.5 * np.pi, 12)
            #         last_noise_time += 0.1
            #     a += noise
            #     logp_t = sess.run(logp, feed_dict={x_ph: o.reshape(1, -1), a_ph: a})
            # else:
            #     # on_policy = True
            #     pass
            # print("time:", env.t, a-aa)

            if not on_policy:
                a = np.array([get_action_from_target_policy(env.t)])
                logp_t = sess.run(logp,
                                  feed_dict={
                                      x_ph: o.reshape(1, -1),
                                      a_ph: a
                                  })

            env.history_buffer['last_action'] = a[0]
            # Repeat the action for 25 simulation steps, reducing the control
            # frequency from 500 Hz to 20 Hz
            for i in range(25):
                o2, r, d, o2_dict = env.step(a[0])
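            # Only the observation, reward, and done flag from the final
            # sub-step are kept; intermediate sub-step rewards are discarded.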

            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2
            # print(ep_len, d)
            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # If the trajectory didn't reach a terminal state we would
                # normally bootstrap the value target; bootstrapping is
                # disabled here, so last_val is always 0.
                if d:
                    last_val = 0
                    # print(o2_dict['position'])
                    # print(np.alltrue(o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]) is True)
                    # print(np.alltrue([o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]]))
                    # print("I did it!!!")
                else:
                    # last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                    last_val = 0
                buf.finish_path(last_val)
                print(ep_ret)

                # logger.store(EpRet=ep_ret+last_val, EpLen=ep_len)
                # if terminal:
                #     o, ep_ret, ep_len = env.reset(), 0, 0

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0
                last_noise_time = 0.0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()
        env.addEpoch()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()  # show the log

        # Stop training when the local wall-clock hour (time.ctime()[-13:-11]) reaches '09'
        if time.ctime()[-13:-11] == '09':
            break

    env.close()
def gail(env_fn, traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(),
         d_hidden_size=64, d_batch_size=64, seed=0,
         steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
         vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000,
         beta=1e-4, target_kl=0.01, logger_kwargs=dict(), save_freq=100,
         r_env_ratio=0, gail_ratio=1, d_itr=20, reward_type='negative',
         pretrain_bc_itr=0):
    """

    additional args
    d_hidden_size : hidden layer size of Discriminator
    d_batch_size : Discriminator's batch size

    r_env_ratio,gail_ratio : the weight of rewards from envirionment and gail .Total reward = gail_ratio *rew_gail+r_env_ratio* rew_from_environment
    
    d_itr : The number of iteration of update discriminater 
    reward_type : GAIL reward has three type ['negative','positive', 'AIRL']
    trj_num :the number of trajectory for 
    pretrain_bc_itr: the number of iteration of pretraining by behavior cloeing
    
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env, hidden_size=d_hidden_size, reward_type=reward_type)

    e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',')
    e_act = np.loadtxt(traj_dir + '/actions.csv', delimiter=',')  # Demo trajectories

    Sibuffer = SIBuffer(obs_dim, act_dim, e_obs, e_act, trj_num=0, max_size=None)

    assert e_obs.shape[1:] == obs_dim
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi,pi_std, entropy, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) - beta * entropy  # entropy bonus
    v_loss = tf.reduce_mean((ret_ph - v)**2)  # ret_ph holds the buffer of discounted returns
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()


    BC = BehavioralCloning(sess,pi,logp,x_ph,a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())


    # Sync params across processes

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        # all_phs lists the placeholders corresponding to the buffers returned by buf.get()
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training (does this need changing too? probably fine as is)
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  # if the KL after the update exceeds 1.5x the target, log it and break out of the training loop
                logger.log('Early stopping at step %d due to reaching max kl.'%i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):  # value function update
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update (compute the new losses)
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)

        std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=std_ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
                     DeltaLossV=(v_l_new - v_l_old),
                     Std=std)

    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0


    if pretrain_bc_itr > 0:
        BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            buf.store_rew(r)
            '''
            if t <150:
                env.render()
                time.sleep(0.03)
            '''

            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if d:
                    last_val = r
                else:
                    # trajectory was cut off before a terminal state: bootstrap the value target
                    last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})  # (was v_last = ...; this works as well)

                # if terminal, this changes nothing; if the trajectory was cut off, last_val is the bootstrap value
                buf.store_rew(last_val)
                buf.finish_path()
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret_task, EpLen=ep_len)  # ,EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail)
        
                o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0

        # Save model
        
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, epoch)


        agent_obs, agent_act = buf.obs_buf, buf.act_buf

        # Update the discriminator (d_batch_size could alternatively be len(agent_obs) // d_itr)
        for _t in range(d_itr):
            e_obs_batch, e_act_batch = Sibuffer.get_random_batch(d_batch_size)
            a_obs_batch = sample_batch(agent_obs, batch_size=d_batch_size)
            a_act_batch = sample_batch(agent_act, batch_size=d_batch_size)
            D.train(sess, e_obs_batch, e_act_batch, a_obs_batch, a_act_batch)
        js_d = D.get_js_div(sess,Sibuffer.main_obs_buf,Sibuffer.main_act_buf,agent_obs,agent_act)
        # ----------------- get GAIL reward -----------------
        rew_gail = D.get_reward(sess, agent_obs, agent_act).ravel()
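        # Relabel the stored rewards as a weighted mix of GAIL reward and
        # environment reward before advantages are computed.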

        buf.rew_buf = gail_ratio * rew_gail + r_env_ratio * buf.rew_buf
        for path_slice in buf.slicelist[:-1]:
            ep_ret_gail = rew_gail[path_slice].sum()
            ep_ret_sum = buf.rew_buf[path_slice].sum()
            logger.store(EpRet_Sum=ep_ret_sum, EpRet_Gail=ep_ret_gail)


        buf.culculate_adv_buf()
        
        # -------------Perform PPO update!--------------------

        update()
        
        logger.store(JS=js_d)


        # Log info about epoch
        # if epoch % 10 == 0:  # print the log only every 10 epochs
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('JS', average_only=True)
        #logger.log_tabular('JS_Ratio', average_only=True)    
        logger.dump_tabular()
def ppo(env_fn,
        ref_func=None,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=500,
        epochs=10000,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=500,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    t_a_ph = core.placeholder_from_space(env.action_space)
    ret_ph = core.placeholder(None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, t_a_ph, ret_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    print("---------------", local_steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # DAgger objectives
    pi_loss = tf.reduce_mean(tf.square(pi - t_a_ph))
    v_loss = tf.reduce_mean((ret_ph - v)**2)
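    # Unlike the PPO losses elsewhere in this file, pi_loss here is a plain
    # imitation (DAgger-style) regression of the policy output onto the
    # reference action t_a_ph supplied by call_mpc(env, ref_func).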

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old = sess.run([pi_loss, v_loss], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            sess.run(train_pi, feed_dict=inputs)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new = sess.run([pi_loss, v_loss], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(1, epochs + 1, 1):
        for t in range(local_steps_per_epoch):
            a_s, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            a = a_s[0]
            ref_a = call_mpc(env, ref_func)
            if (epoch < 100):
                a = ref_a

            # save and log
            buf.store(o, a, ref_a, r)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    maxRev = float("-inf")  #negative infinity in the beginning
    #maxRevActionSeq=[]
    maxRevTSTT = 0
    maxRevRevenue = 0
    maxRevThroughput = 0
    maxRevJAH = 0
    maxRevRemVeh = 0
    maxRevJAH2 = 0
    maxRevRMSE_MLvio = 0
    maxRevPerTimeVio = 0
    maxRevHOTDensity = pd.DataFrame()
    maxRevGPDensity = pd.DataFrame()
    maxtdJAHMax = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Rescale the sampled action from (-1, 1) to the toll range
            # [tollMin, tollMax], since it was sampled around a tanh-activated mean
            numpyFromA = np.array(a[0])
            numpyFromA = ((numpyFromA + 1.0) *
                          (env.state.tollMax - env.state.tollMin) /
                          2.0) + env.state.tollMin
            a[0] = np.ndarray.tolist(numpyFromA)
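            # The affine map above sends a = -1 -> tollMin and a = +1 -> tollMax:
            # toll = (a + 1) / 2 * (tollMax - tollMin) + tollMin.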

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    #get other stats and store them too
                    otherStats = env.getAllOtherStats()
                    if np.any(np.isnan(np.array(otherStats))):
                        sys.exit("Nan found in statistics! Error")
                    logger.store(EpTSTT=otherStats[0],
                                 EpRevenue=otherStats[1],
                                 EpThroughput=otherStats[2],
                                 EpJAH=otherStats[3],
                                 EpRemVeh=otherStats[4],
                                 EpJAH2=otherStats[5],
                                 EpMLViolRMSE=otherStats[6],
                                 EpPerTimeVio=otherStats[7],
                                 EptdJAHMax=otherStats[8])
                    #determine max rev profile
                    if ep_ret > maxRev:
                        maxRev = ep_ret
                        maxRevActionSeq = env.state.tollProfile
                        maxRevTSTT = otherStats[0]
                        maxRevRevenue = otherStats[1]
                        maxRevThroughput = otherStats[2]
                        maxRevJAH = otherStats[3]
                        maxRevRemVeh = otherStats[4]
                        maxRevJAH2 = otherStats[5]
                        maxRevRMSE_MLvio = otherStats[6]
                        maxRevPerTimeVio = otherStats[7]
                        maxRevHOTDensity = env.getHOTDensityData()
                        maxRevGPDensity = env.getGPDensityData()
                        maxtdJAHMax = otherStats[8]
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpTSTT', average_only=True)
        logger.log_tabular('EpRevenue', average_only=True)
        logger.log_tabular('EpThroughput', average_only=True)
        logger.log_tabular('EpJAH', average_only=True)
        logger.log_tabular('EpRemVeh', average_only=True)
        logger.log_tabular('EpJAH2', average_only=True)
        logger.log_tabular('EpMLViolRMSE', average_only=True)
        logger.log_tabular('EpPerTimeVio', average_only=True)
        logger.log_tabular('EptdJAHMax', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    print("Max cumulative reward obtained= %f " % maxRev)
    print(
        "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f"
        %
        (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh,
         maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax))
    outputVector = [
        maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH,
        maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio,
        maxtdJAHMax
    ]
    #print("\n===Max rev action sequence is\n",maxRevActionSeq)
    exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector)
    exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
def pg_linesearch(env_fn,
                  actor_critic=core.mlp_actor_critic,
                  ac_kwargs=dict(),
                  seed=0,
                  steps_per_epoch=4000,
                  epochs=50,
                  gamma=0.99,
                  pi_lr=3e-4,
                  backtrack_coeff=0.8,
                  delta=0.01,
                  backtrack_iters=1000,
                  vf_lr=1e-3,
                  train_v_iters=80,
                  lam=0.97,
                  max_ep_len=1000,
                  logger_kwargs=dict(),
                  save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)
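    # Vanilla policy gradient: pi_loss = -E[ logp(a|s) * A(s,a) ], so gradient
    # descent on it is ascent on the advantage-weighted log-likelihood.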

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    pi_params = trpo_core.get_vars('pi')
    gradient = trpo_core.flat_grad(pi_loss, pi_params)
    #v_ph, hvp = trpo_core.hessian_vector_product(d_kl, pi_params)
    v_ph = tf.placeholder(tf.float32, shape=gradient.shape)
    ##TODO: more analysis on damping Coeff
    #if damping_coeff > 0:
    #hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = trpo_core.flat_concat(pi_params)
    set_pi_params = trpo_core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """

        ##TODO: Next Step is to try the hessian
        x = np.zeros_like(b)
        r = b.copy(
        )  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        cg_iters = 20
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        #TODO: Next step is to calculate the hessian using safe distance
        #Hx = lambda x : mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old, ent = sess.run(
            [gradient, pi_loss, v_loss, approx_ent], feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)
        #x = cg(Hx, g)
        #x = optimize.fmin_cg(pi_l_old, x0, fprime=g)
        x = g
        old_params = sess.run(get_pi_params)
        old_penalty = env.penalty(env.s)
        alpha = np.sqrt(2 * delta / (np.dot(x, g) + EPS))
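        # With the Hessian-vector-product path disabled above, x is just the
        # raw gradient g, and alpha mirrors the TRPO step size
        # sqrt(2 * delta / (x^T g)) for trust-region radius delta.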

        # backtracking line search, hard constraint check on env penalty
        for j in range(backtrack_iters):
            step = backtrack_coeff**j
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            pi_l_new = sess.run([pi_loss], feed_dict=inputs)
            penalty = env.penalty(env.s)
            #print("Old Penalty {}, Penalty {}".format(old_penalty,penalty))

            if penalty == 0 or penalty < old_penalty:
                #if pi_l_new <= pi_l_old:
                logger.log('Accepting new params at step %d of line search.' %
                           j)
                logger.store(BacktrackIters=j)
                logger.store(penalty=penalty, old_penalty=old_penalty)
                break

            if j == backtrack_iters - 1:
                logger.log('Line search failed! Keeping old params.')
                logger.store(BacktrackIters=j)
                logger.store(penalty=penalty, old_penalty=old_penalty)

        # Policy gradient step
        #sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        #pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict={v_ph: old_params - alpha * x * step})
        logger.store(LossPi=pi_l_old,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', average_only=True)
        logger.log_tabular('penalty', average_only=True)
        logger.log_tabular('old_penalty', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        explorer=None,
        eps=.03,
        pretrain_epochs=0):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    total_epochs = epochs + pretrain_epochs

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(total_epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # explore if you are in a pretrain epoch or if eps-greedy
            pre = epoch < pretrain_epochs
            during = random.random() < eps
            if pre or during:
                if explorer is None:
                    raise ValueError('Trying to explore but explorer is None')
                state = env.env.state_vector()
                a = explorer.sample_action(state)

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == total_epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
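# --- Editor's sketch (not part of the example above) ------------------------
# The pretrain / eps-greedy branch above only assumes that `explorer` exposes
# a sample_action(state) method. A hypothetical uniform-random explorer that
# is compatible with the way it is used (a[0] indexing and buf.store expect a
# batched (1, act_dim) action, like the sampled `pi` output):
import numpy as np

class UniformExplorer:
    def __init__(self, action_space):
        # assumes a Box-style action space with low/high bounds
        self.low = np.asarray(action_space.low)
        self.high = np.asarray(action_space.high)

    def sample_action(self, state):
        # `state` is unused here; a task-specific explorer could condition on it
        a = np.random.uniform(self.low, self.high)
        return a.reshape(1, -1)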
def ppo(BASE_DIR,
        expert_density,
        env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        steps_per_epoch=1000,
        epochs=10,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=50,
        train_v_iters=50,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        data_n=10):

    data = {}  # ALL THE DATA

    # `args` is a module-level argparse namespace (not passed into this function)
    logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # update rule
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(
        -2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                       args.iter_length)
    density = policy_distr.density()

    data[0] = {
        'pol_s': policy_distr.num_samples,
        'pol_t': policy_distr.num_trajects
    }

    dist_rewards = []

    # repeat REIL for given number of rounds
    for i in range(args.rounds):

        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        reward = lambda s: expert_density(s) / (density(s) + args.eps)

        dist_rewards.append(reward)

        start_time = time.time()
        o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        r = reward(o)  # custom reward

        # Main loop: collect experience in env and update/log each epoch
        for epoch in range(epochs):
            for t in range(local_steps_per_epoch):

                a, v_t, logp_t = sess.run(get_action_ops,
                                          feed_dict={x_ph: o.reshape(1, -1)})

                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                o, old_r, d, _ = env.step(a[0])
                r = reward(o)
                ep_ret += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (t == local_steps_per_epoch - 1):
                    if not (terminal):
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len)
                    # bootstrap the value target; note this example always uses
                    # the custom density-ratio reward at the final state
                    # (the env reward old_r and the value estimate are not used here)
                    last_val = reward(o)
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    r = reward(o)

            # store model!
            if (epoch == epochs - 1): logger.save_state({'env': env}, None)

            # Perform PPO update!
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts',
                               (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            print(message)

        policy = lambda state: sess.run(
            get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0]
        data[i] = {
            'pol_s': policy_distr.num_samples,
            'pol_t': policy_distr.num_trajects
        }
        data[i]['rewards'] = evaluate_reward(env, policy, data_n)

        if i != args.rounds - 1:
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    return data, dist_rewards
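# --- Editor's note (not part of the example above) ---------------------------
# Each round above trains PPO on a reward of the form
#     reward(s) = expert_density(s) / (policy_density(s) + eps)
# so states that the expert visits more often than the current policy are
# rewarded, pushing the policy's state distribution toward the expert's.
# A self-contained sketch of that construction with stand-in unit-covariance
# Gaussian densities (the names below are hypothetical placeholders, not the
# Gaussian_Density class used above):
import numpy as np

def gaussian_pdf(s, mean):
    d = np.asarray(s, dtype=float) - np.asarray(mean, dtype=float)
    return float(np.exp(-0.5 * d @ d) / (2 * np.pi) ** (len(d) / 2))

expert_density_sketch = lambda s: gaussian_pdf(s, mean=[0.0, 0.0])
policy_density_sketch = lambda s: gaussian_pdf(s, mean=[1.0, 1.0])
eps_sketch = 1e-3

def density_ratio_reward(s):
    return expert_density_sketch(s) / (policy_density_sketch(s) + eps_sketch)

# e.g. density_ratio_reward([0.0, 0.0]) > density_ratio_reward([1.0, 1.0])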
Example #8
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=33,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.998,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=60,
        train_v_iters=60,
        lam=0.95,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    ## Logger setup
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    ## Random seed setting
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    ## Environment instantiation
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    #Policies vector (only one for this project)
    policies = []

    #TensorFlow session
    sess = tf.Session()

    #Build policy and value networks
    MAP = MAPolicy(scope='policy_0',
                   ob_space=env.observation_space,
                   ac_space=env.action_space,
                   network_spec=pi_specs,
                   normalize=True,
                   v_network_spec=v_specs)
    policies = [MAP]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Create aux placeholders for the computation graph
    adv_ph, ret_ph, logp_old_ph = core.placeholders(1, 1, 1)

    # Get main placeholders for the computation graph
    map_phs_dict = MAP.phs
    map_phs = [v for k, v in map_phs_dict.items()]

    for k, v in map_phs_dict.items():
        if v.name == None:
            v.name = k

    # Append aux and main placeholders
    # Need placeholders in *this* order later (to zip with data from buffer)
    new_phs = [adv_ph, ret_ph, logp_old_ph]
    all_phs = np.append(map_phs, new_phs)

    # Instantiate experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['policy_net', 'vpred_net'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(MAP.taken_action_logp -
                   logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)  # PPO-clip limits
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph,
                                         min_adv))  # Policy loss function
    v_loss = tf.reduce_mean(
        (ret_ph - MAP.scaled_value_tensor)**2)  # Value loss function

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph - MAP.taken_action_logp
    )  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -MAP.taken_action_logp
    )  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(
        ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)
    )  # a logical value which states whether there was clipping
    clipfrac = tf.reduce_mean(tf.cast(
        clipped, tf.float32))  # a measure of clipping for posterior analysis

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(
        pi_loss)  #Policy network optimizer
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(
        v_loss)  #Value network optimizer

    #Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Set up logger variables to be saved (it is necessary to save everything that is
    # input/output to the networks so that the policy can be played afterwards during testing)
    out_act_dict = MAP.sampled_action
    out_state_dict = MAP.state_out
    logger_outputs = {**out_act_dict, **out_state_dict}

    for k in list(logger_outputs.keys()):  # iterate over a copy: the dict is mutated below
        if 'lstm' in k:
            logger_outputs[k + '_out'] = logger_outputs.pop(k)

    logger_inputs = map_phs_dict

    logger.setup_tf_saver(sess, inputs=logger_inputs, outputs=logger_outputs)

    # ======================================================================== #
    # ===================== Auxiliary Training Functions ===================== #
    # ======================================================================== #

    # Compute metrics for analysis during and after training
    def compute_metrics(extra_dict={}):

        loss_outs = {
            'pi_loss': pi_loss,
            'v_loss': v_loss,
            'approx_ent': approx_ent,
            'approx_kl': approx_kl,
            'approx_cf': clipfrac,
            'taken_action_logp': MAP.taken_action_logp,
            'ratio': ratio,
            'min_adv': min_adv
        }

        out_loss = policies[0].sess_run(buf.obs_buf,
                                        sess_act=sess,
                                        extra_feed_dict=extra_dict,
                                        other_outputs=loss_outs,
                                        replace=True)

        return out_loss['pi_loss'], out_loss['v_loss'], out_loss[
            'approx_ent'], out_loss['approx_kl'], out_loss['approx_cf']

    # ======================================================================= #

    # Run session on policy and value optimizers for training their respective networks
    def train(net, extra_dict={}):

        if net == 'pi':
            train_outs = {'train_pi': train_pi, 'approx_kl': approx_kl}
        elif net == 'v':
            train_outs = {'train_v': train_v}
        else:
            print("Error: Network not defined")
            return

        out_train = policies[0].sess_run(buf.obs_buf,
                                         sess_act=sess,
                                         extra_feed_dict=extra_dict,
                                         other_outputs=train_outs,
                                         replace=True)
        if net == 'pi':
            return out_train['approx_kl']

    # ======================================================================= #

    # Perform training procedure
    def update():

        print("======= update!")

        #get aux data from the buffer and match it with its respective placeholders
        buf_data = buf.get(aux_vars_only=True)
        aux_inputs = {k: v for k, v in zip(new_phs, buf_data)}

        #for the training, the actions taken during the experience loop are also inputs to the network
        extra_dict = {k: v for k, v in buf.act_buf.items() if k != 'vpred'}

        for k, v in extra_dict.items():
            if k == 'action_movement':
                extra_dict[k] = np.expand_dims(v, 1)

        #actions and aux variables from the buffer are joined and passed to compute_metrics (observations are joined within the functions)
        extra_dict.update(aux_inputs)
        pi_l_old, v_l_old, ent, kl, cf = compute_metrics(extra_dict)

        # Policy training loop
        for i in range(train_pi_iters):
            if i % 10 == 0:
                print("training pi iter ", i)
            kl = train('pi', extra_dict)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break

        logger.store(StopIter=i)
        print("")

        # Value training loop
        for j in range(train_v_iters):
            if j % 10 == 0:
                print("training v iter ", j)
            train('v', extra_dict)

        # Log changes from update with a new run on compute_metrics
        pi_l_new, v_l_new, ent, kl, cf = compute_metrics(extra_dict)

        #Store information
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        #Reset experience variables
        # (note: these names are local to update(); env.reset() still takes effect,
        # but the outer-loop observation is not refreshed here)
        o, ep_ret, ep_len = env.reset(), 0, 0

        #Reset policy
        for policy in policies:
            policy.reset()

        print("======= update finished!")

    # ======================================================================= #

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # ======================================================================= #
    # ========================== Experience Loop ============================ #
    # ======================================================================= #

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        print("epoch: ", epoch)
        for t in range(local_steps_per_epoch):

            # Pass observations through networks and get action + predicted value
            if len(policies) == 1:  #this project's case
                a, info = policies[0].sess_run(o, sess_act=sess)
                v_t = info['vpred']
                logp_t = info['ac_logp']
            else:
                o = splitobs(o, keepdims=False)
                ob_policy_idx = np.split(np.arange(len(o)), len(policies))
                actions = []
                for i, policy in enumerate(policies):
                    inp = itemgetter(*ob_policy_idx[i])(o)
                    inp = listdict2dictnp([inp] if ob_policy_idx[i].shape[0] ==
                                          1 else inp)
                    ac, info = policy.act(inp)
                    actions.append(ac)
                # note: this multi-policy branch is unused in this project and does not set `a`
                action = listdict2dictnp(actions, keepdims=True)

            # Take a step in the environment
            o2, r, d, env_info = env.step(a)
            ep_ret += r
            ep_len += 1

            # If env.render is uncommented, the experience loop is displayed (visualized)
            # in real time (much slower, but excellent for debugging)

            # env.render()

            # save experience in buffer and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            # Treat the end of a trajectory
            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1) or env_info.get(
                    'discard_episode', False):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                if d:
                    last_val = 0
                else:
                    _, info = policies[0].sess_run(o, sess_act=sess)
                    last_val = info['vpred']

                #Compute advantage estimates and rewards-to-go
                buf.finish_path(last_val)

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)

                o, ep_ret, ep_len = env.reset(), 0, 0

                for policy in policies:
                    policy.reset()

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            print("Saved epoch: ", epoch)
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
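# --- Editor's sketch (not part of the example above) ------------------------
# The pi_loss / approx_kl tensors built above implement PPO's clipped
# surrogate objective and the approximate-KL diagnostic used for early
# stopping. The same quantities in plain NumPy for one batch of log-probs
# and advantages (illustrative only):
import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    ratio = np.exp(logp - logp_old)                 # pi(a|s) / pi_old(a|s)
    min_adv = np.where(adv > 0,
                       (1 + clip_ratio) * adv,
                       (1 - clip_ratio) * adv)      # same trick as the tf.where above
    pi_loss = -np.mean(np.minimum(ratio * adv, min_adv))
    approx_kl = np.mean(logp_old - logp)            # stop updating when > 1.5 * target_kl
    return pi_loss, approx_kl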
Example #9
def impala(gym_or_pyco, env_fn, ac_kwargs=dict(), n=4, logger_kwargs=dict(), actor_critic=core.mlp_actor_critic, num_cpu=1, epochs=200, max_ep_len=300,
           steps_per_epoch=4000, gamma=0.99, seed=73,vf_lr=1e-3, pi_lr = 3e-4, entropy_cost = 0.00025, baseline_cost = .5, rho_bar = 1, c_bar = 1, train_pi_iters=80,train_v_iters=80,
           export_dir="/home/clement/Documents/spinningup_instadeep/data/cmd_impala/cmd_impala_s0/simple_save",
           tensorboard_path = '/home/clement/spinningup/tensorboard'):

    dict_continous_gym = ['CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar',
                          'Assault', 'Asterix', 'Asteroids', 'Atlantis',
                          'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout',
                          'Carnival',
                          'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk',
                          'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar',
                          'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
                          'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan',
                          'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing',
                          'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
                          'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink']
    dict_discrete_gym = []
    dict_gym = ['CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid', 'Alien', 'Amidar',
                'Assault', 'Asterix', 'Asteroids', 'Atlantis',
                'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing', 'Breakout', 'Carnival',
                'Centipede', 'ChopperCommand', 'CrazyClimber', 'Defender', 'Demon_attack', 'DoubleDunk',
                'ElevatorAction', 'Enduro', 'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar',
                'Hero', 'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull', 'KungFuMaster',
                'MpntezumaRevenge', 'MsPacman', 'NameThisGame', 'Phoenix', 'Pitfall', 'Pooyan',
                'PrivateEye', 'Qbert', 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'Skiing',
                'Solaris', 'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown',
                'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge', 'Zaxxon', 'Numberlink']

    env = env_fn()
    proc_id()
    seed += 10000 * 3
    tf.set_random_seed(seed)
    np.random.seed(seed)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    if gym_or_pyco != 'gym':
        env = env()

    obs_dim = env.observation_space.shape
    if env.action_space == 4:
        act_dim = env.action_space
    try:
        act_dim = env.action_space.n
    except AttributeError:
        act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph

    if gym_or_pyco == 'pyco':
        x_ph = tf.placeholder(tf.float32, shape=(None, obs_dim[0], obs_dim[1], 1))
    else:
        x_ph = tf.placeholder(tf.float32, shape=(1, obs_dim[0], obs_dim[1], obs_dim[2]))
    # a_ph = core.placeholders_from_spaces(env.action_space)
    if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete):
        a_ph = tf.placeholder(tf.uint8, shape=(1))

    elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box):
        a_ph = tf.placeholder(tf.float32, shape=(env.action_space.shape[0]))

    else:
        a_ph = tf.placeholder(tf.int32, shape=(None))

    if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete):
        pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy',
                                                    action_space=env.action_space.n)
    elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box):
        pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, policy='baseline_gaussian_policy',
                                            action_space=env.action_space.shape[0])
    else:
        pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='baseline_categorical_policy',
                                                    action_space=env.action_space.n)
    adv_ph, pi_act_ph, logp_old_ph, v_trace_ph = core.placeholders(None, None, None, None)
    advantages = tf.stop_gradient(adv_ph)
    all_phs = [x_ph, a_ph, adv_ph, pi_act_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]
    logits_op = [logits]



    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # need to get rho_param from the v_trace function..

    c_param = tf.minimum(tf.exp(logp - logp_old_ph), c_bar)
    rho_param = tf.minimum(tf.exp(logp - logp_old_ph), rho_bar)

    def compute_baseline_loss(v_trace_ph, v):
        # Loss for the baseline, summed over the time dimension.
        # Multiply by 0.5 to match the standard update rule:
        # d(loss) / d(baseline) = advantage
        return .5 * tf.reduce_sum(tf.square(v_trace_ph - v))

    def compute_entropy_loss(logits):
        policy = tf.nn.softmax(logits)
        log_policy = tf.nn.log_softmax(logits)
        entropy_per_timestep = tf.reduce_sum(-policy * log_policy, axis=-1)
        return -tf.reduce_sum(entropy_per_timestep)

#advantages = adv_buf[i]

    def compute_policy_gradient_loss(logits, advantages, a=all_phs[1]):
        #actions = tf.one_hot(a,depth=act_dim)
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=a, logits=logits)
        advantages = tf.stop_gradient(advantages)
        policy_gradient_loss_per_timestep = cross_entropy * advantages
        return tf.reduce_sum(policy_gradient_loss_per_timestep)

    total_loss = compute_entropy_loss(logits) * entropy_cost + compute_baseline_loss(v_trace_ph, v) * baseline_cost + \
                 compute_policy_gradient_loss(logits, adv_ph, all_phs[1])

    #pi_loss = tf.reduce_mean(adv_ph*rho_param)

    #v_loss = tf.reduce_mean((v_trace_ph - v) ** 2)

    def v_trace(obs_list, rews_list, act_list, logp_list, gamma, c_param, rho_param, v, obs_dim1, obs_dim2, last_obs_buf, sess):
        """Prend en entrée les trajectoires et les rewards associés, renvoie un dictionaire associé à des states : à un state x_s est associé un scalaire v_{x_s}
        les trajectoires seront une liste de trajectoires

        Args:
           obs_list: a list of different paths observations used for v_trace.
           rews_list: the list of the rewards lists from each of every paths used for v_trace.
           act_list: a list of the actions lists from each of every paths used for v_trace.
           logp_list: a vector of log probabilities log(p_old(a|s)) used for v_trace.
           gamma : hyperparam in v_trace and GAE
           c_param : a placeholder to be fed.
           rho_param : a placeholder to be fed
           v : a tf function for the value. Depends on x_ph and a_ph
           obs_dim1 : size of rows for board
           obs_dim2 : size of cols for board
           last_obs_buf : final observation of the path, used to bootstrap the value at the end.
           sess: contains the up to date policy of the graph from the learner at the time of computing v_trace.
        """

        size_obs = len(obs_list)
        v_tr = np.zeros(size_obs+1)

        c_param = sess.run([c_param],feed_dict={x_ph: obs_list, a_ph: act_list, logp_old_ph: logp_list})[0]
        c_param[-1] = 1
        rho_param = sess.run([rho_param], feed_dict={x_ph: obs_list, a_ph: act_list, logp_old_ph: logp_list})
        #v_tr[-1] = sess.run([v],feed_dict={x_ph: np.reshape(obs_list[-1], (1, obs_dim1, obs_dim2, 1))}) + rews_list[-1] * rho_param[0][-1]
        v_tr[-1] = last_val_buf
        last_obs = np.reshape(obs_list[-1], (1, obs_dim1, obs_dim2, 1))
        v_tr[-2] = sess.run([v],feed_dict={x_ph: last_obs})[0]+rho_param[0][-1]*(rews_list[-1] + gamma * sess.run([v],feed_dict={x_ph: last_obs_buf})[0]- sess.run([v],feed_dict={x_ph: last_obs})[0]) + gamma * c_param[-1] *(v_tr[-1] - sess.run([v],feed_dict={x_ph: last_obs_buf})[0] )
        for i in range(size_obs-1):
            obs_t_1 = np.reshape(obs_list[size_obs-2-i], (1, obs_dim1, obs_dim2, 1))
            obs_t = np.reshape(obs_list[size_obs-i-1],(1,obs_dim1, obs_dim2, 1))
            v_tr[size_obs-2-i] = sess.run([v],feed_dict={x_ph: obs_t_1})[0]+rho_param[0][size_obs-2-i]*(rews_list[size_obs-2-i] + gamma * sess.run([v], feed_dict={x_ph: obs_t})[0]- sess.run([v],feed_dict={x_ph: obs_t_1})[0]) + gamma * c_param[size_obs-2-i] *(v_tr[size_obs-i-1] - sess.run([v], feed_dict={x_ph: obs_t})[0] )
        return v_tr

    # with adv_ph the advantage with v_trace. On the whole thing?..
    with tf.name_scope('pi_loss'):
        #core.variable_summaries(pi_loss)
        core.variable_summaries(total_loss)

    # Optimizers
    #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    #train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
    #train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(total_loss)
    total_env_frames=1e6
    momentum=0.
    epsilon=.1
    decay=.99

    #tf.get_variable(
    #    'num_environment_frames',
    #    initializer=tf.zeros_initializer(),
    #    shape=[],
    #    dtype=tf.float32,
    #    trainable=False,
    #    collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    #num_env_frames = tf.train.get_global_step()
    #learning_rate = tf.train.polynomial_decay(pi_lr, num_env_frames,
    #                                          total_env_frames, 0)
    global_step = 100
    starter_learning_rate = 3e-4
    end_learning_rate=3e-5
    decay_steps=5e2
    learning_rate = tf.compat.v1.train.polynomial_decay(starter_learning_rate, global_step,
                                                        decay_steps, 0)

    optimizer = tf.train.RMSPropOptimizer(learning_rate, decay,
                                          momentum, epsilon)
    train_pi = optimizer.minimize(total_loss)

    sess = tf.Session()
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_path + '/train',
                                         sess.graph)
    test_writer = tf.summary.FileWriter(tensorboard_path + '/test')

    sess.run(tf.global_variables_initializer())
    sess.run(sync_all_params())
    # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi':pi, 'v': v})


    def update(adv_buf, obs_list, act_list, logp_list):
        # note: v_trace_list is read from the enclosing scope (filled in the epoch loop below)
        #pi_l_old, v_l_old = sess.run([pi_loss, v_loss],feed_dict={x_ph
        #
        # : obs_list[0], a_ph: act_list[0], logp_old_ph: logp_list[0], v_trace_ph: v_trace_list[0][:-1], adv_ph: adv_buf[0]})
        for i in range(n):
            for _ in range(train_pi_iters):
                sess.run(train_pi, feed_dict ={x_ph: obs_list[i], a_ph: act_list[i], logp_old_ph: logp_list[i], adv_ph: adv_buf[i], v_trace_ph: v_trace_list[i][:-1]})
            #for _ in range(train_v_iters):
            #    sess.run(train_v,feed_dict={x_ph: obs_list[i], a_ph: act_list[i], v_trace_ph: v_trace_list[i][:-1]})
        #pi_l_new, v_l_new = sess.run([pi_loss, v_loss],feed_dict={x_ph: obs_list[0], a_ph: act_list[0], logp_old_ph: logp_list[0], v_trace_ph: v_trace_list[0][:-1], adv_ph: adv_buf[0]})
        #logger.store(LossPi=pi_l_old, LossV=v_l_old, DeltaLossPi=(pi_l_new-pi_l_old), DeltaLossV=(v_l_new - v_l_old))


    saver = tf.train.Saver()
    save_path = saver.save(sess,export_dir)

    for epoch in range(epochs):
        # Begins collecting trajectories and computing v_traces, adv.
        obs_list = []
        rew_list = []
        act_list = []
        val_list = []
        logp_list = []
        v_trace_list = []
        adv_buf = []
        actors = [Actor(x_ph, a_ph, np.random.randint(0, 39240, size=1)[0]) for i in range(n)]  # one random seed per actor
        ep_len = []
        last_rew_list = []
        for i in range(n):
            actors[i].load_last_weights(export_dir)
            obs_buf, act_buf, rew_buf, val_buf, logp_buf, last_rew_buf, last_val_buf, last_obs_buf = actors[i].get_episode(env,get_action_ops,gym_or_pyco,obs_dim)
            obs_buf = np.reshape(obs_buf, (np.shape(obs_buf)[0], obs_dim[0], obs_dim[1], 1))
            ep_len.append(len(obs_buf))
            last_rew_list = np.append(last_rew_list, last_rew_buf)
            logp_buf = np.reshape(logp_buf, (np.shape(logp_buf)[0]))
            obs_list.append(obs_buf)
            rew_list.append(rew_buf)
            act_list.append(act_buf)
            val_list.append(val_buf)
            logp_list.append(logp_buf)
            v_trace_list.append(v_trace(obs_list[i], rew_list[i], act_list[i], logp_list[i], gamma, c_param, rho_param, v, obs_dim[0], obs_dim[1], last_obs_buf, sess))
            rews = np.append(rew_list[i], last_rew_buf)
            vals = np.append(val_list[i], last_val_buf)
            adv = rews[:-1] + gamma * v_trace_list[i][1:] - vals[:-1]
            # normalization of adv:
            adv_mean, adv_std = mpi_statistics_scalar(adv)
            adv = (adv - adv_mean) / (adv_std + 1e-5)
            adv_buf.append(adv)

        update(adv_buf, obs_list, act_list, logp_list)
        saver = tf.train.Saver()
        save_path = saver.save(sess,
                               export_dir)
        EpRet = []
        for k in range(n):
            EpRet = np.append(EpRet,sum(rew_list[k]))
            EpRet[-1] = EpRet[-1]+last_rew_list[k]
        logger.store(EpRet=EpRet)
        logger.store(EpLen=ep_len)
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.dump_tabular()
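# --- Editor's sketch (not part of the example above) ------------------------
# A compact reference version of the V-trace targets that v_trace() above
# computes, assuming rewards r[0..T-1], value estimates V[0..T] (V[T] is the
# bootstrap value) and truncated importance weights rho[0..T-1], c[0..T-1]:
import numpy as np

def v_trace_targets(r, V, rho, c, gamma=0.99):
    T = len(r)
    vs = np.array(V, dtype=np.float64)   # vs[T] stays equal to the bootstrap V[T]
    for t in reversed(range(T)):
        delta = rho[t] * (r[t] + gamma * V[t + 1] - V[t])
        vs[t] = V[t] + delta + gamma * c[t] * (vs[t + 1] - V[t + 1])
    return vs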
Example #10
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.策略给出的x状态下的a动作概率
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators. (Default here: 0.)
        
        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch. (Default here: 4000.)

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform. (Default here: 50.)

        gamma (float): Discount factor. (Always between 0 and 1; here 0.99.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. (Default here: 0.2.)

        pi_lr (float): Learning rate for policy optimizer. (Default here: 3e-4.)

        vf_lr (float): Learning rate for value function optimizer. (Default here: 1e-3.)

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.) (Default here: 80.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch. (Default here: 80.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.) This is the lambda in TD(lambda); here 0.97.

        max_ep_len (int): Maximum length of trajectory / episode / rollout. (Default here: 1000.)

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05; here 0.01.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function. (Default here: every 10 epochs.)

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()  # create the environment
    obs_dim = env.observation_space.shape  # observation-space dimensions
    act_dim = env.action_space.shape  # action-space dimensions

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph,
                                        **ac_kwargs)  # takes obs and actions, returns the policy outputs

    # Need all placeholders in *this* order later (to zip with data from buffer):
    # obs, action, advantage, return, and the old policy's log-prob
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)  # experience buffer

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])  # count variables
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
               var_counts)  # log the number of policy and value parameters

    # PPO objectives
    ratio = tf.exp(logp -
                   logp_old_ph)  # pi(a|s) / pi_old(a|s), i.e. ratio = pi(new) / pi(old)
    min_adv = tf.where(
        adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) *
        adv_ph)  # if adv_ph > 0 (advantage increases) use (1+clip_ratio)*adv_ph,
    # otherwise (advantage decreases) use (1-clip_ratio)*adv_ph
    pi_loss = -tf.reduce_mean(tf.minimum(
        ratio * adv_ph, min_adv))  # policy loss = min of ratio*adv and min_adv, limiting the policy shift
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph - logp
    )  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp
    )  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(
        ratio > (1 + clip_ratio), ratio <
        (1 - clip_ratio))  # logical OR deciding whether clipping applied (clipped = True/False)
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))  # cast clipped to float to get the clip fraction

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v
                  for k, v in zip(all_phs, buf.get())
                  }  # zip([x_ph, a_ph, adv_ph, ret_ph, logp_old_ph],
        #     [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf])
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)  # compute both losses and the entropy

        # Training
        for i in range(train_pi_iters):  # policy training iterations
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)  # compute the KL estimate
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break  # stop the policy training early
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)  # train the value network

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac],
            feed_dict=inputs)  # recompute the losses, KL and clip fraction
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new -
                                 v_l_old))  # log old losses, KL, clip fraction and the delta losses

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0  # reset

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):  # for each epoch
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph:
                                                 o.reshape(1, -1)})  # get an action for the current observation

            o2, r, d, _ = env.step(a[0])  # step the environment to get the reward
            ep_ret += r  # accumulate the reward
            ep_len += 1  # increment the step count

            # save and log
            buf.store(o, a, r, v_t, logp_t)  # store (obs, action, reward, value, logp) in the buffer
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)  # episode done, or reached the max episode length
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)  # trajectory cut off by the epoch after ep_len steps
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
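# --- Editor's note (not part of the example above) ---------------------------
# The tf.where construction of min_adv used in these examples is an unrolled
# form of the usual clip expression: for positive advantages the clipped term
# is (1 + clip_ratio) * adv, for negative ones (1 - clip_ratio) * adv, so
# min(ratio * adv, min_adv) equals min(ratio * adv, clip(ratio) * adv).
# A quick NumPy check of that equivalence:
import numpy as np

eps_clip = 0.2
ratio = np.array([0.5, 0.9, 1.1, 1.5])
adv = np.array([1.0, -1.0, 2.0, -2.0])
min_adv = np.where(adv > 0, (1 + eps_clip) * adv, (1 - eps_clip) * adv)
lhs = np.minimum(ratio * adv, min_adv)
rhs = np.minimum(ratio * adv, np.clip(ratio, 1 - eps_clip, 1 + eps_clip) * adv)
assert np.allclose(lhs, rhs)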
Example #11
    def __init__(self,
                 env_fn,
                 seed=0,
                 steps_per_epoch=4000,
                 epochs=50,
                 gamma=0.99,
                 clip_ratio=0.2,
                 pi_lr=3e-4,
                 vf_lr=1e-3,
                 train_pi_iters=80,
                 train_v_iters=80,
                 lam=0.97,
                 max_ep_len=1000,
                 target_kl=0.01,
                 save_freq=10,
                 exp_name='',
                 test_agent=False,
                 load=False):
        # seed
        self.seed = seed
        self.seed += 10000 * proc_id()
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)

        # Hyper-parameter
        self.env = env_fn()
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.shape
        self.gamma, self.lam = gamma, lam
        self.steps_per_epoch, self.epochs = steps_per_epoch, epochs

        self.pi_lr, self.vf_lr = pi_lr, vf_lr
        self.clip_ratio, self.target_kl = clip_ratio, target_kl
        self.train_pi_iters, self.train_v_iters = train_pi_iters, train_v_iters
        self.exp_name, self.save_freq, self.max_ep_len = exp_name, save_freq, max_ep_len

        if test_agent:
            from visdom import Visdom
            self.viz = Visdom()
            assert self.viz.check_connection()
            self.win = self.viz.matplot(plt)

        # Experience buffer
        self.buf = PPOBuffer(self.obs_dim, self.act_dim, self.steps_per_epoch,
                             self.gamma, self.lam)

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(config=config, graph=self.graph)

            self.__make_model()
            self.sess.run(tf.global_variables_initializer())

            # Sync params across processes
            self.sess.run(sync_all_params())
            Count_Variables()
            print('Trainable_variables:')
            for v in tf.compat.v1.trainable_variables():
                print('{}\t  {}'.format(v.name, str(v.shape)))

            var_list = tf.global_variables()
            self.saver = tf.compat.v1.train.Saver(var_list=var_list,
                                                  max_to_keep=1)

            # summary
            self.writer = tf.compat.v1.summary.FileWriter("logs/" + exp_name)
            if load:
                self.load()
            self.sess.run(sync_all_params())

            self.ep_ret_ph = tf.placeholder(tf.float32,
                                            shape=(),
                                            name="ep_ret_ph")
            self.ep_Entropy_ph = tf.placeholder(tf.float32,
                                                shape=(),
                                                name="Entropy")
            self.clipfrac_ph = tf.placeholder(tf.float32,
                                              shape=(),
                                              name="clipfrac")
            self.ep_len_ph = tf.placeholder(tf.float32,
                                            shape=(),
                                            name="ep_len_ph")
            self.test_summary = tf.compat.v1.summary.merge([
                tf.compat.v1.summary.scalar('EP_ret',
                                            self.ep_ret_ph,
                                            family='test'),
                tf.compat.v1.summary.scalar('EP_len',
                                            self.ep_len_ph,
                                            family='test')
            ])
            self.entropy_summary = tf.compat.v1.summary.merge([
                tf.compat.v1.summary.scalar('Entropy',
                                            self.ep_Entropy_ph,
                                            family='test'),
                tf.compat.v1.summary.scalar('clipfrac',
                                            self.clipfrac_ph,
                                            family='test')
            ])
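
As a usage sketch for the summary machinery set up above (an illustration only, not a method of this class): a merged scalar summary built from a placeholder can be evaluated with per-episode statistics and handed to the FileWriter. The logdir name and the example return values below are assumptions.

# Self-contained mini-demo (assumptions only; not this class's code) of feeding a
# scalar placeholder into a merged summary and writing it with a FileWriter,
# mirroring the EP_ret / EP_len setup in the __init__ above.
import tensorflow as tf

with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
    ep_ret_ph = tf.compat.v1.placeholder(tf.float32, shape=(), name="ep_ret_ph")
    test_summary = tf.compat.v1.summary.merge(
        [tf.compat.v1.summary.scalar("EP_ret", ep_ret_ph, family="test")])
    writer = tf.compat.v1.summary.FileWriter("logs/summary_demo", sess.graph)
    for epoch, ep_ret in enumerate([10.0, 12.5, 9.0]):
        summary = sess.run(test_summary, feed_dict={ep_ret_ph: ep_ret})
        writer.add_summary(summary, global_step=epoch)
    writer.flush()
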
Example #12
def ppo(env_fn,
  # by default, use the neural network mlp we define in core
  actor_critic=core.mlp_actor_critic,
  ac_kwargs=dict(),
  seed=0,
  steps_per_epoch=4000,
  epochs=50,
  gamma=0.99,
  clip_ratio=0.2,
  pi_lr=3e-4,
  vf_lr=1e-3,
  train_pi_iters=80,
  train_v_iters=80,
  lam=0.97,
  max_ep_len=1000,
  target_kl=0.01,
  logger_kwargs=dict(),
  save_freq=10):
  """
  "Args:
  env_fn: A function which creates a copy of the environment.
  The environment must satisfy the OpenAI Gym API.

  actor_critic: A function which takes in placeholder symbols
  for state, ``x_ph``, and action ``a_ph``, and returns the main
  outputs from the agent's Tensorflow computation graph:

  ===========  ================  ======================================
  Symbol       Shape             Description
  ===========  ================  ======================================
  ``pi``       (batch, act_dim)  | Samples actions from policy given states.
  ``logp``     (batch,)          | Gives log probability according to
                                  | the policy, of taking actions ``a_ph``
                                  | in states ``x_ph``.
  ``logp_pi``  (batch,)          | Gives log probability, according to
                                  | the policy, of the action sampled by ``pi``.
  ``v``        (batch,)          | Gives the value estimate for states
                                  | in ``x_ph``.  (Critical: make sure
                                  | to flatten this!)
  ===========  ================  ======================================" -OpenAI
  Okay, quick interruption to OpenAI documentation here.
  actor_critic is the function which interfaces with tensorflow.  It takes in
  ``x_ph`` (x placeholder), i.e. a representation of the current state, and
  ``a_ph``, a representation of some actions.  (TODO: document
  *what* these actions are).
  actor_critic runs these inputs through the tensorflow graph and returns several
  pieces of information that are relevant to PPO; these are described above.

  Back to OpenAI:
  "
  ac_kwargs (dict): Any kwargs appropriate for actor_critic function
      you provided to PPO.

  seed (int): Seed for random number generators.

  steps_per_epoch (int): Number of steps of interaction (state-action pairs)
      for the agent and the environment in each epoch.

  epochs (int): Number of epochs of interaction (equivalent to
      number of policy updates) to perform.

  gamma (float): Discount factor. (Always between 0 and 1.)

  clip_ratio (float): Hyperparameter for clipping in the policy objective.
      Roughly: how far can the new policy go from the old policy while
      still profiting (improving the objective function)? The new policy
      can still go farther than the clip_ratio says, but it doesn't help
      on the objective anymore.  (Usually small, 0.1 to 0.3.)

  pi_lr (float): Learning rate for policy optimizer.

  vf_lr (float): Learning rate for value function optimizer.

  train_pi_iters (int): Maximum number of gradient descent steps to take
      on policy loss per epoch.  (Early stopping may cause optimizer
      to take fewer than this.)

  train_v_iters (int): Number of gradient descent steps to take on
      value function per epoch.

  lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
      close to 1).

  max_ep_len (int): Maximum length of trajectory / episode / rollout.

  target_kl (float): Roughly what KL divergence we think is appropriate
      between new and old policies after an update.  This will get used
      for early stopping.  (Usually small, 0.01 or 0.05.)

  logger_kwargs (dict): Keyword args for EpochLogger.

  save_freq (int): How often (in terms of gap between epochs) to save
      the current policy and value function." - OpenAI
  """
  logger = EpochLogger(**logger_kwargs)
  logger.save_config(locals())

  # modify the seed based on the process id so that if we run this in
  # multiple processes simultaneously, they don't all do the exact same thing
  seed += 10000 * proc_id()
  # set up our random stuff with this seed
  tf.set_random_seed(seed)
  np.random.seed(seed)

  # create the environment
  env = env_fn()
  obs_dim = env.observation_space.shape
  act_dim = env.action_space.shape

  # tell the policy (implemented in actor_critic function) what the action space is
  ac_kwargs['action_space'] = env.action_space

  # "Inputs to computation graph" -OpenAI
  # create tensorflow placeholders for observations (x_ph), actions (a_ph),
  # advantages (adv_ph), returns (ret_ph), log probabilities
  # in the current state of the policy (logp_old_ph)
  # (old since this is used compared to the newer version of the policy
  # we are creating in the optimization step, comparing to this "old" version)
  x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
  adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

  # "Main outputs from computation graph" -OpenAI
  # essentially here we fill in the tensorflow graph so we can compute
  # the pi, logp, logp_pi, and v tensors based on the
  # x_ph and a_ph we created above
  pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

  # "Need all placeholders in *this* order later (to zip with data from buffer)" -OpenAI
  all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

  # "Every step, get: action, value, and logprob" -OpenAI
  # we later feed this list into tf.session.run()
  # to tell it to compute the value of pi, v, logp_pi
  # using the tensorflow graph we have created
  get_action_ops = [pi, v, logp_pi]

  # Experience buffer

  # number of steps per epoch per process
  local_steps_per_epoch = int(steps_per_epoch / num_procs())

  buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

  # Count the number of parameters we are gonna be training,
  # both for the policy and for the value function
  var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
  logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

  # PPO objectives
  # ratio is the ratio of two probabilities:
  # pi(a|s) / pi_old(a|s)
  # where pi(a|s) is the probability of performing action a
  # given state s GIVEN THE POLICY WHOSE PARAMETERS WE ARE CHANGING
  # DURING THE OPTIMIZATION STEP
  # and pi_old(a|s) is the probability of the policy,
  # with fixed mlp parameters after the last update,
  # performing action a in state s

  # we essentially use math to find the gradient of pi(a|s) with respect
  # to the parameters of the mlp, and this is the core of how we calculate
  # the gradient of the objective function for gradient descent

  ratio = tf.exp(logp - logp_old_ph) # "pi(a|s) / pi_old(a|s)"-OpenAI

  # this min_adv, along with the tf.minimum call in the next line of code,
  # implement the PPO-clip functionality

  # NOTE: calling this `min_adv` is a bit confusing; if advantage is negative
  # this is the min value we allow the gradient descent to consider as the advantage;
  # but it is the MAX value if advantage is positive.
  min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)

  # create the functions whose gradients we wish to use for gradient descent
  # during optimization
  # for our policy optimization, it is the PPO objective; 
  # for the value function it is simply an error-squared
  # note that reduce_mean just calculates the mean of the values in the tensor;
  # ie. this gives the expected value of the loss given the experimental values we have
  pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
  v_loss = tf.reduce_mean((ret_ph - v) ** 2)

  # Info (useful to watch during learning)
  approx_kl = tf.reduce_mean(logp_old_ph - logp) # "a sample estimate for KL-divergence, easy to compute" -OpenAI
  approx_ent = tf.reduce_mean(-logp) # "a sample estimate for entropy, also easy to compute" -OpenAI
  clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
  clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # what fraction of advantages are clipped

  # Optimizers
  # These use gradient descent with the gradient of the objective
  # functions we defined above to improve parameters for pi and v
  train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
  train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

  # initialize the tensorflow computation graph's parameters
  # with values
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  # "Sync params across processes" -OpenAI
  sess.run(sync_all_params())

  # Setup model saving
  logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

  def update():
    # create a dictionary of values, which specify to tensorflow what
    # to input for the placeholders: tensors containing the data from
    # the trajectory we have stored in buf
    inputs = {k:v for k, v in zip(all_phs, buf.get())}

    # calculate these for logging later
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
      # run a training step for the policy, and estimate the kl-divergence
      # (ie. how much the policy changed) on this step
      _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
      kl = mpi_avg(kl)

      # if the kl divergence is too high, stop training on this step
      # TODO: understand better why it is important to do this
      if kl > 1.5 * target_kl:
        logger.log('Early stopping at step %d due to reaching max kl.'%i)
        break

    logger.store(StopIter=i)

    # train our value function mlp
    for _ in range(train_v_iters):
      sess.run(train_v, feed_dict=inputs)

    # "Log changes from update" -OpenAI
    # TODO: This could be made a bit more computationally efficient by not recalculating pi_l_old each loop
    # after having calculated the same thing as pi_l_new the previous run through the loop!
    # Plus, does it really make the most sense to output pi_l_old and v_l_old as LossPi and LossV
    # instead of pi_l_new and v_l_new?
    pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
        KL=kl, Entropy=ent, ClipFrac=cf,
        DeltaLossPi=(pi_l_new - pi_l_old),
        DeltaLossV=(v_l_new - v_l_old))
    

  start_time = time.time()

  # initialize the variables we use while training
  # o = observation (env.reset() returns initial observation)
  # r = reward (starts as 0)
  # d = done? (whether current episode in env is over)
  # ep_ret = episode return
  # ep_len = length of episode so far
  o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

  # "Main loop: collect experience in env and update/log each epoch"
  for epoch in range(epochs):
    for t in range(local_steps_per_epoch):
      
      # run the computation of the action, value function, and probability of the action
      # using the most recent observation in the x_ph slot
      a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

      # save and log
      buf.store(o, a, r, v_t, logp_t)
      logger.store(VVals=v_t)

      # take the action we computed and advance the environment
      o, r, d, _ = env.step(a[0])
      ep_ret += r
      ep_len += 1

      terminal = d or (ep_len == max_ep_len)
      if terminal or (t==local_steps_per_epoch - 1):
        if not terminal:
          print('Warning: trajectory cut off by epoch at %d steps'%ep_len)
        
        # "if trajectory didn't reach terminal state, bootstrap value target" -OpenAI
        # in other words, if we are stopping this trajectory due to a termination
        # signal from the env, last_val = the reward from the last step, r;
        # otherwise we stopped because we reached the max episode length or max local_steps_per_epoch,
        # in which case we set last_val = estimate of the value of the current state based on the v function
        # we are training
        last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
        
        buf.finish_path(last_val)

        # "only store EpRet / EpLen if trajectory finished" -OpenAI
        if terminal:
          logger.store(EpRet=ep_ret, EpLen=ep_len)

        # reset our training variables and the training environment
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # every save_freq epochs, save the state of the environment,
    # along with the current state of our policy and value function models;
    # these are saved automatically by save_state
    # since we have already called logger.setup_tf_saver
    if (epoch % save_freq == 0) or (epoch == epochs - 1):
      logger.save_state({'env': env}, None)

    # "Perform PPO update!"
    update()
    
    # "Log info about epoch"
    logger.log_tabular('Epoch', epoch)
    try:
      logger.log_tabular('EpRet', with_min_and_max=True)
      logger.log_tabular('EpLen', average_only=True)
    except:
      pass
    logger.log_tabular('VVals', with_min_and_max=True)
    logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
    logger.log_tabular('LossPi', average_only=True)
    logger.log_tabular('LossV', average_only=True)
    logger.log_tabular('DeltaLossPi', average_only=True)
    logger.log_tabular('DeltaLossV', average_only=True)
    logger.log_tabular('Entropy', average_only=True)
    logger.log_tabular('KL', average_only=True)
    logger.log_tabular('ClipFrac', average_only=True)
    logger.log_tabular('StopIter', average_only=True)
    logger.log_tabular('Time', time.time() - start_time)
    logger.dump_tabular()
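
Before the next example, here is a small, self-contained NumPy sketch (not from this repository) of the clipped surrogate objective and diagnostics that the ratio / min_adv / pi_loss / approx_kl / clipfrac lines above implement; the array values are made up.

# Standalone NumPy illustration of the PPO-clip surrogate and its diagnostics.
import numpy as np

def ppo_clip_stats(logp, logp_old, adv, clip_ratio=0.2):
    ratio = np.exp(logp - logp_old)              # pi(a|s) / pi_old(a|s)
    min_adv = np.where(adv > 0,
                       (1 + clip_ratio) * adv,   # cap on the upside when adv > 0
                       (1 - clip_ratio) * adv)   # cap on the downside when adv < 0
    pi_loss = -np.mean(np.minimum(ratio * adv, min_adv))
    approx_kl = np.mean(logp_old - logp)         # same sample KL estimate as approx_kl above
    clipfrac = np.mean((ratio > 1 + clip_ratio) | (ratio < 1 - clip_ratio))
    return pi_loss, approx_kl, clipfrac

# When the ratio drifts outside [1 - clip_ratio, 1 + clip_ratio], the objective keeps the
# more pessimistic of the clipped and unclipped terms; a large approx_kl is what triggers
# the early-stopping check (kl > 1.5 * target_kl) in the update loop above.
loss, kl, cf = ppo_clip_stats(np.array([-0.2, 0.7]), np.array([-0.5, -0.5]),
                              np.array([1.0, -2.0]))
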
Example #13
def vpg(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10,
        custom_h=None,
        do_checkpoint_eval=False,
        env_name=None):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # create logger for tensorboard
    tb_logdir = "{}/tb_logs/".format(logger.output_dir)
    tb_logger = Logger(log_dir=tb_logdir)

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    if custom_h is not None:
        hidden_layers_str_list = custom_h.split('-')
        hidden_layers_int_list = [int(h) for h in hidden_layers_str_list]
        ac_kwargs['hidden_sizes'] = hidden_layers_int_list

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the
    # whole GPU memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # log tf graph
    tf.summary.FileWriter(tb_logdir, sess.graph)

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # for saving the best models and performances during train and evaluate
    best_eval_AverageEpRet = 0.0
    best_eval_StdEpRet = 1.0e20

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save.
            logger.save_state({'env': env}, epoch)

            # Evaluate and save best model
            if do_checkpoint_eval and epoch > 0:
                # Below is a hack: best-model-related files are saved at itr 999999, hence simple_save999999.
                # Done this way, test_policy and plot can be used directly to test the best models.
                # saved best models includes:
                # 1) a copy of the env
                # 2) the best rl model with parameters
                # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch
                # note that 1) and 2) are spinningup defaults, and 3) is a custom save
                best_eval_AverageEpRet, best_eval_StdEpRet = eval_and_save_best_model(
                    best_eval_AverageEpRet,
                    best_eval_StdEpRet,
                    # a new logger is created and passed in so that the new logger can leverage the directory
                    # structure without messing up the logger in the training loop
                    eval_logger=EpochLogger(
                        **dict(exp_name=logger_kwargs['exp_name'],
                               output_dir=os.path.join(logger.output_dir,
                                                       "simple_save999999"))),
                    train_logger=logger,
                    tb_logger=tb_logger,
                    epoch=epoch,
                    # env_name is passed in so that an env can be created when and where it is needed. This avoids a
                    # logx.save_state() error where an env pointer cannot be pickled
                    env_name=env_name,
                    get_action=lambda x: sess.run(
                        pi, feed_dict={x_ph: x[None, :]})[0])

        # Perform VPG update!
        update()

        # Log into tensorboard
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="EpRet",
                      with_min_and_max=True)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="EpLen",
                      with_min_and_max=False)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="VVals",
                      with_min_and_max=True)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="LossPi",
                      with_min_and_max=False)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="LossV",
                      with_min_and_max=False)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="DeltaLossPi",
                      with_min_and_max=False)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="DeltaLossV",
                      with_min_and_max=False)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="Entropy",
                      with_min_and_max=False)
        log_key_to_tb(tb_logger,
                      logger,
                      epoch,
                      key="KL",
                      with_min_and_max=False)
        tb_logger.log_scalar(tag="TotalEnvInteracts",
                             value=(epoch + 1) * steps_per_epoch,
                             step=epoch)
        tb_logger.log_scalar(tag="Time",
                             value=time.time() - start_time,
                             step=epoch)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
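
For reference on what buf.finish_path(last_val) is doing with gamma and lam in these examples, here is a short NumPy sketch of GAE-Lambda advantages and discounted reward-to-go value targets in their usual formulation; it is an illustration, not necessarily line-for-line identical to the VPGBuffer/PPOBuffer used here.

# NumPy sketch of the end-of-trajectory bookkeeping: GAE-Lambda advantages and
# discounted returns, bootstrapped with last_val for cut-off trajectories.
import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    x = np.asarray(x, dtype=float)
    y = np.zeros_like(x)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

def finish_path_sketch(rews, vals, last_val, gamma=0.99, lam=0.97):
    # last_val is the value estimate of the cut-off state (or the final reward / 0
    # for a genuinely terminal state, depending on the variant above).
    rews = np.append(np.asarray(rews, dtype=float), last_val)
    vals = np.append(np.asarray(vals, dtype=float), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # TD residuals
    adv = discount_cumsum(deltas, gamma * lam)          # GAE-Lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]             # value-function targets
    return adv, ret
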
Example #14
    def __init__(self, env, num_agents, config):
        self.num_agent = num_agents
        self.config = config
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.is_rnn = config["network"] == "rnn"
        self.is_cnn = config["network"] == "cnn"
        config["num_agent"] = num_agents
        if isinstance(self.observation_space, gym.spaces.Box):

            if config["share_policy"]:
                pg_agent = PGAgent(
                    "shared",
                    self.observation_space,
                    # integer division: gym.spaces.Discrete expects an int
                    gym.spaces.Discrete(self.action_space.shape[0] //
                                        self.num_agent),
                    config,
                )
                self.agents = [pg_agent for i in range(num_agents)]
            else:
                self.agents = [
                    PGAgent(
                        i,
                        self.observation_space,
                        # integer division: gym.spaces.Discrete expects an int
                        gym.spaces.Discrete(self.action_space.shape[0] //
                                            self.num_agent),
                        config,
                    ) for i in range(num_agents)
                ]
        else:
            if config["share_policy"]:
                pg_agent = PGAgent(
                    "shared",
                    self.observation_space[0],
                    self.action_space[0],
                    config,
                )
                self.agents = [pg_agent for i in range(num_agents)]
            else:
                self.agents = [
                    PGAgent(i, self.observation_space[i], self.action_space[i],
                            config) for i in range(num_agents)
                ]
        self.local_steps_per_epoch = int(config["steps_per_epoch"] /
                                         num_procs())
        if type(self.action_space[0]) is gym.spaces.Discrete:
            action_dim = 1
        else:
            action_dim = self.action_space[0].shape[0]
        if type(self.observation_space[0]) is gym.spaces.Discrete:
            obs_dim = self.observation_space[0].n
        else:
            if self.is_cnn:
                obs_dim = self.observation_space[0].shape
            else:
                obs_dim = self.observation_space[0].shape[0]
        self.buf = new_buffer(
            num_agents,
            obs_dim,
            action_dim,
            size=self.local_steps_per_epoch,
            type=config["algo"],
            is_rnn=self.is_rnn,
            is_cnn=self.is_cnn,
            rnn_length=config["rnn_length"],
        )

        # init logger

        self.logger = Logger(self.config)

        # init session and sync params
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(sync_all_params())

        tf.io.write_graph(
            graph_or_graph_def=self.sess.graph_def,
            logdir=os.path.join(self.config["output_dir"]),
            name="model",
        )
        # save model
        self.savers = []
        with tf.device('/cpu:0'):
            for i in range(num_agents):
                vars_list = tf.compat.v1.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES, scope="Agent_{}".format(i))
                self.savers.append(tf.train.Saver(vars_list, max_to_keep=100))

        if self.config["save_path"] is None:
            self.save_path = "/tmp/agents.pickle"
        else:
            self.save_path = self.config["save_path"]
            self.save_path += 's/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        # load model
        if self.config["load_model"]:
            self.restore(self.config["restore_path"], [0, 1])
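
A self-contained sketch (an illustration, not this class's save()/restore()) of the per-scope Saver pattern used above, where each Saver only handles the variables created under one agent's scope; the scope name and checkpoint directory below are assumptions.

# Per-scope checkpointing demo: collect only Agent_0's variables and save/restore
# them with a dedicated Saver, as the self.savers list above does per agent.
import os
import tensorflow as tf

os.makedirs("/tmp/agents_demo", exist_ok=True)
with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
    with tf.compat.v1.variable_scope("Agent_0"):
        w = tf.compat.v1.get_variable("w", shape=[4, 2])
    sess.run(tf.compat.v1.global_variables_initializer())
    agent0_vars = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="Agent_0")
    saver_0 = tf.compat.v1.train.Saver(agent0_vars, max_to_keep=100)
    ckpt_path = saver_0.save(sess, "/tmp/agents_demo/agent_0", global_step=0)
    saver_0.restore(sess, ckpt_path)  # reloads only Agent_0's variables
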
Example #15
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dims = env.action_space  #[ choice.shape for choice in env.action_space.values() ]
    #act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholder(None), core.placeholder(
        None), {}
    for k in env.action_space:
        logp_old_ph[k] = core.placeholder(None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dims, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio, min_adv, pi_loss = {}, {}, {}
    for k in env.action_space:
        ratio[k] = tf.exp(logp[k] - logp_old_ph[k])  # pi(a|s) / pi_old(a|s)
        min_adv[k] = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                              (1 - clip_ratio) * adv_ph)
        pi_loss[k] = -tf.reduce_mean(tf.minimum(ratio[k] * adv_ph, min_adv[k]))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl, approx_ent, clipped, clipfrac = {}, {}, {}, {}
    for k in env.action_space:
        approx_kl[k] = tf.reduce_mean(
            logp_old_ph[k] -
            logp[k])  # a sample estimate for KL-divergence, easy to compute
        approx_ent[k] = tf.reduce_mean(
            -logp[k])  # a sample estimate for entropy, also easy to compute
        clipped[k] = tf.logical_or(ratio[k] > (1 + clip_ratio), ratio[k] <
                                   (1 - clip_ratio))
        clipfrac[k] = tf.reduce_mean(tf.cast(clipped[k], tf.float32))

    pi_loss_sum = tf.reduce_sum(list(pi_loss.values()))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss_sum)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    save_outputs = {'v': v}
    for k in env.action_space:
        save_outputs['pi_' + k] = pi[k]
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs=save_outputs)

    def update():

        inputs = {}
        for k, v in zip(all_phs, buf.get()):
            if type(k) is not dict:
                inputs[k] = v
            else:
                for k_, v_ in zip(k.values(), v.values()):
                    inputs[k_] = v_

        pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            for k in kl:
                kl[k] = mpi_avg(kl[k])
            if max(list(kl.values())) > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        sum_dict = lambda x: x if type(x) is not dict else np.sum(
            list(x.values()))

        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=sum_dict(kl),
                     Entropy=sum_dict(ent),
                     ClipFrac=sum_dict(cf),
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            o2, r, d, _ = env.step(**a)
            env.render()  #force_realtime=True)
            ep_ret += r
            #print ("frame_return: %.4f sofar_EpRet: %.4f" % (r, ep_ret))
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                print("EpRet:", ep_ret)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
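
A standalone sketch (with hypothetical names) of the feed-dict flattening done in update() above, where some entries of all_phs are dicts keyed by action-space component and must be expanded into individual placeholder-to-array pairs.

# Flatten a mix of plain and dict-valued placeholders/buffers into one feed_dict,
# mirroring the zip(all_phs, buf.get()) loop in update() above.
def build_feed_dict(placeholders, buffers):
    feed = {}
    for ph, data in zip(placeholders, buffers):
        if isinstance(ph, dict):
            # e.g. logp_old_ph = {'move': <placeholder>, 'turn': <placeholder>}
            for key in ph:
                feed[ph[key]] = data[key]   # match by key rather than dict order
        else:
            feed[ph] = data
    return feed

# build_feed_dict([x_ph, logp_old_ph], [obs_batch, {'move': lp_m, 'turn': lp_t}])
# would yield {x_ph: obs_batch, logp_old_ph['move']: lp_m, logp_old_ph['turn']: lp_t}.
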
def main(env_fn,
         traj_dir,
         actor_critic=core.mlp_actor_critic,
         bc_itr=1000,
         ac_kwargs=dict(),
         d_hidden_size=64,
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         pi_lr=3e-4,
         vf_lr=1e-3,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=4000,
         target_kl=0.01,
         save_freq=100,
         r_env_ratio=0,
         reward_type='negative',
         trj_num=30,
         buf_size=None,
         si_update_ratio=0.02,
         js_threshold_ratio=0.5,
         js_smooth=5):
    """
    test behavior cloning
    """

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env,
                      hidden_size=d_hidden_size)  #!add Discriminator object
    D_js_m = JS_div_machine(env, hidden_size=d_hidden_size)

    e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',')
    e_act = np.loadtxt(traj_dir + '/actions.csv',
                       delimiter=',')  # Demo trajectory

    Sibuffer = SIBuffer(obs_dim,
                        act_dim,
                        e_obs,
                        e_act,
                        trj_num=trj_num,
                        max_size=buf_size,
                        js_smooth_num=js_smooth)  #!sibuf

    assert e_obs.shape[1:] == obs_dim
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])

    sess = tf.Session()

    BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph)

    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=bc_itr)
    # Sync params across processes
    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0
    # Setup model saving

    for epoch in range(1000000):
        a, v_t, logp_t = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})

        o, r, d, _ = env.step(a[0])
        env.render()
        time.sleep(1e-3)

        ep_ret_task += r
        ep_len += 1

        terminal = d or (ep_len == max_ep_len)
        if terminal:
            print('EpRet{},EpLen{}'.format(ep_ret_task, ep_len))
            o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(
            ), 0, False, 0, 0, 0, 0
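
The BehavioralCloning helper used above is not shown in this snippet; a common formulation (an assumption, not this repository's class) is to maximize the policy's log-likelihood of the expert actions, e.g.:

# Hedged sketch of a behavior-cloning objective: maximize log pi(a_expert | s_expert).
import tensorflow as tf

def make_bc_ops(logp, lr=1e-3):
    # logp: log-probability of a_ph under the current policy, as returned by
    # actor_critic(x_ph, a_ph, ...); lr is an assumed learning rate.
    bc_loss = -tf.reduce_mean(logp)  # negative log-likelihood of expert actions
    train_op = tf.compat.v1.train.AdamOptimizer(lr).minimize(bc_loss)
    return bc_loss, train_op

# One BC step would then feed expert state-action pairs:
#   sess.run(train_op, feed_dict={x_ph: expert_obs_batch, a_ph: expert_act_batch})
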
def sigail(env_fn,
           traj_dir,
           actor_critic=core.mlp_actor_critic_add,
           ac_kwargs=dict(),
           d_hidden_size=64,
           seed=0,
           steps_per_epoch=4000,
           epochs=50,
           gamma=0.99,
           clip_ratio=0.2,
           pi_lr=3e-4,
           vf_lr=1e-3,
           train_pi_iters=40,
           train_v_iters=40,
           lam=0.97,
           max_ep_len=4000,
           beta=1e-4,
           target_kl=0.01,
           logger_kwargs=dict(),
           save_freq=100,
           r_env_ratio=0,
           d_itr=20,
           reward_type='negative',
           trj_num=20,
           buf_size=1000,
           si_update_ratio=0.02,
           js_smooth=5,
           buf_update_type='random',
           pretrain_bc_itr=0):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env, hidden_size=d_hidden_size,
                      reward_type=reward_type)  #!add Discriminator object
    D_js_m = JS_div_machine(env, hidden_size=d_hidden_size)

    e_obs = np.zeros((buf_size, obs_dim[0]))
    e_act = np.zeros((buf_size, act_dim[0]))
    Sibuffer = SIBuffer(obs_dim,
                        act_dim,
                        e_obs,
                        e_act,
                        trj_num=trj_num,
                        max_size=buf_size,
                        js_smooth_num=js_smooth)  #!sibuf
    trj_full = False
    assert e_obs.shape[1:] == obs_dim
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, pi_std, entropy, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(
        ratio * adv_ph, min_adv)) - beta * entropy  #add entropy
    v_loss = tf.reduce_mean((ret_ph - v)**2)  # ret_ph is fed the buffer of cumulative rewards
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()

    BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Sync params across processes

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v
                  for k, v in zip(all_phs, buf.get())
                  }  # all_phs holds the placeholders corresponding to each buffer
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training  # does this also need to be changed? Probably not
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  # if the KL at this update is 1.5x larger than expected, log it and break the train loop
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):  # update v
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update (compute the new losses)
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)

        std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
        logger.store(
            LossPi=pi_l_old,
            LossV=v_l_old,
            KL=kl,
            Entropy=std_ent,
            ClipFrac=cf,
            DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
            DeltaLossV=(v_l_new - v_l_old),
            Std=std)

    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0

    if pretrain_bc_itr > 0:
        BC.learn(Sibuffer.expert_obs,
                 Sibuffer.expert_act,
                 max_itr=pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            '''
            if t <150:
                env.render()
                time.sleep(0.03)
            '''

            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                '''
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                '''

                #!add discriminator train
                '''# adding the terminal transition as well would also be an option
                o_reshape = o.reshape(core.combined_shape(1,obs_dim))
                a_reshape = a.reshape(core.combined_shape(1,act_dim))
                agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#! convert o from (obspace,) to (1,obspace) before appending
                agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)# also include the terminal state-action pair when training D
                '''
                agent_obs = buf.obs_buf[buf.path_slice()]
                agent_act = buf.act_buf[buf.path_slice()]

                #D.train(sess,e_obs,e_act ,agent_obs,agent_act)

                #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()# add the rewards produced by the state-action pairs to the buffer (rewards are shifted by one step)

                if trj_full:
                    gail_r = 1
                else:
                    gail_r = 0
                rew_gail = gail_r * D.get_reward(
                    sess, agent_obs,
                    agent_act).ravel()  # rewards produced by the state-action pairs, added to the buffer (shifted by one step)

                ep_ret_gail += rew_gail.sum()  #!before gail_ratio
                ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail

                rew_gail_head = rew_gail[:-1]
                last_val_gail = rew_gail[-1]

                buf.rew_buf[slice(
                    buf.path_start_idx + 1,
                    buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[
                        slice(buf.path_start_idx + 1,
                              buf.ptr)]  #!add GAIL reward; the final reward is excluded, so this slice is one element shorter

                if d:  # episode actually terminated: use the final env + GAIL reward as the last value
                    last_val = r_env_ratio * r + last_val_gail
                else:  # trajectory was cut off: bootstrap the value target from the critic
                    last_val = sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})  # was v_last = ...; this seems fine

                buf.finish_path(
                    last_val)  # make sure buf.finish_add_r_v has been called before this
                if terminal:
                    # only store the trajectory in Sibuffer if the trajectory finished
                    Sibuffer.store(
                        agent_obs, agent_act,
                        sum_reward=ep_ret_task)  #!store trajectory
                    logger.store(EpRet=ep_ret_task,
                                 EpRet_Sum=ep_ret_sum,
                                 EpRet_Gail=ep_ret_gail,
                                 EpLen=ep_len)

                o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(
                ), 0, False, 0, 0, 0, 0

        # Save model

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        if not (trj_full):
            M_obs_buf = Sibuffer.get_obs_trj()
        trj_full = (M_obs_buf.shape[0] >= buf_size)

        if trj_full:  # when the replay buffer has grown past the r_threshold
            Sibuffer.update_main_buf(ratio_update=si_update_ratio,
                                     update_type=buf_update_type)
            M_obs_buf = Sibuffer.get_obs_trj()
            M_act_buf = Sibuffer.get_act_trj()

            d_batch_size = len(agent_obs)
            for _t in range(d_itr):
                e_obs_batch, e_act_batch = Sibuffer.get_random_batch(
                    d_batch_size)

                D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act)

                D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs,
                             e_act)  # trained only to measure the distance between the buffer and the expert
            js_d = D.get_js_div(sess, Sibuffer.main_obs_buf,
                                Sibuffer.main_act_buf, agent_obs, agent_act)
            js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs,
                                       e_act)

        else:
            js_d, js_d_m = 0.5, 0.5
        update()

        Sibuffer.store_js(js_d)
        logger.store(JS=js_d,
                     JS_M=js_d_m,
                     JS_Ratio=Sibuffer.js_ratio_with_random)

        # Log info about epoch
        #if epoch%10 == 0:#logger print each 10 epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('buffer_r', Sibuffer.buffer_r_average)
        logger.log_tabular('JS', average_only=True)
        logger.log_tabular('JS_M', average_only=True)
        logger.log_tabular('JS_Ratio', average_only=True)
        logger.dump_tabular()
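
# Hedged sketch (not from the original examples): the clipped PPO surrogate and the
# sample KL estimate computed with NumPy on made-up values, mirroring the
# `ratio` / `min_adv` / `pi_loss` / `approx_kl` construction used in these listings.
import numpy as np

_clip_ratio = 0.2
_logp_old = np.array([-1.0, -0.5, -2.0])
_logp_new = np.array([-0.8, -0.7, -1.5])
_adv = np.array([1.0, -0.5, 2.0])

_ratio = np.exp(_logp_new - _logp_old)  # pi(a|s) / pi_old(a|s)
_min_adv = np.where(_adv > 0, (1 + _clip_ratio) * _adv, (1 - _clip_ratio) * _adv)
_pi_loss = -np.mean(np.minimum(_ratio * _adv, _min_adv))  # clipped surrogate loss
_approx_kl = np.mean(_logp_old - _logp_new)  # sample estimate of KL(old || new)
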
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10, custom_h=None, eval_episodes=50,
        do_checkpoint_eval=False, env_name=None, eval_temp=1.0, train_starting_temp=1.0,
        env_version=None, env_input=None, target_arcs=None):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # create logger for tensorboard
    tb_logdir = "{}/tb_logs/".format(logger.output_dir)
    tb_logger = Logger(log_dir=tb_logdir)

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    if custom_h is not None:
        hidden_layers_str_list = custom_h.split('-')
        hidden_layers_int_list = [int(h) for h in hidden_layers_str_list]
        ac_kwargs['hidden_sizes'] = hidden_layers_int_list

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    temperature_ph = tf.placeholder(tf.float32, shape=(), name="init")

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, temperature_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, temperature_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
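    # Each MPI process collects only its share of the batch, so the total number of
    # environment steps per epoch across processes is approximately steps_per_epoch
    # (up to the integer division).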
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
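    # min_adv is the clipped branch of the objective: (1 + clip_ratio) * adv when adv > 0
    # and (1 - clip_ratio) * adv when adv < 0, so min(ratio * adv, min_adv) equals the
    # usual min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv).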
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v) ** 2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the
    # whole GPU memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # log tf graph
    tf.summary.FileWriter(tb_logdir, sess.graph)

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'temperature': temperature_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, []

    # initialize variables for keeping track of BEST eval performance
    best_eval_AverageEpRet = -0.05  # a negative value so that best model is saved at least once.
    best_eval_StdEpRet = 1.0e30

    # save is used to only allow saving BEST models after half of training epochs
    save = True

    # below are used for early-stop. We early stop if
    # 1) a best model has been saved, and,
    # 2) more than 60 epochs have passed without a new save
    saved = False
    early_stop_count_started = False
    episode_count_after_saved = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        current_temp = _get_current_temperature(epoch, epochs, train_starting_temp)
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1),
                                                                 temperature_ph: current_temp})

            # save and log
            buf.store(o, a, r, v_t, logp_t, current_temp)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            if env_version >= 4 and env.action_is_dummy:  # a is dummy action
                ep_dummy_action_count += 1
                ep_dummy_steps_normalized.append(ep_len / env.allowed_steps)

            terminal = d or (ep_len == max_ep_len)

            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1),
                                                              temperature_ph: current_temp})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    if env_version >= 4:
                        logger.store(EpDummyCount=ep_dummy_action_count)
                        logger.store(EpTotalArcs=env.adjacency_matrix.sum())
                        if len(ep_dummy_steps_normalized) > 0:
                            ep_dummy_steps_normalized = np.asarray(ep_dummy_steps_normalized, dtype=np.float32).mean()
                            logger.store(EpDummyStepsNormalized=ep_dummy_steps_normalized)

                o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, []

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save.
            # logger.save_state({'env_name': env_name}, epoch)

            # # Save a new model every save_freq and at the last epoch. Only keep one copy - the current model
            # logger.save_state({'env_name': env_name})


            # Evaluate and save best model
            if do_checkpoint_eval and epoch > 0:
                # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999.
                # Doing this way, I can use test_policy and plot directly to test the best models.
                # saved best models includes:
                # 1) a copy of the env_name
                # 2) the best rl model with parameters
                # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch
                # note that 1) and 2) are spinningup defaults, and 3) is a custom save
                best_eval_AverageEpRet, best_eval_StdEpRet, saved = eval_and_save_best_model(
                    best_eval_AverageEpRet,
                    best_eval_StdEpRet,
                    # a new logger is created and passed in so that the new logger can leverage the directory
                    # structure without messing up the logger in the training loop
                    eval_logger=EpochLogger(**dict(
                        exp_name=logger_kwargs['exp_name'],
                        output_dir=os.path.join(logger.output_dir, "simple_save999999"))),

                    train_logger=logger,
                    tb_logger=tb_logger,
                    epoch=epoch,
                    # the env_name is passed in so that an env can be created when and where it is needed. This
                    # works around a logx.save_state() error where an env pointer cannot be pickled
                    env_name="F{}x{}T{}_SP{}_v{}".format(env.n_plant, env.n_product, env.target_arcs, env.n_sample,
                                                         env_version) if env_version >= 3 else env_name,
                    env_version=env_version,
                    env_input=env_input,
                    render=False,  # change this to True if you want to visualize how arcs are added during evaluation
                    target_arcs=env.target_arcs,
                    get_action=lambda x: sess.run(pi, feed_dict={x_ph: x[None, :],
                                                                 temperature_ph: eval_temp})[0],
                    # number of samples to draw when simulate demand
                    n_sample=5000,
                    num_episodes=eval_episodes,
                    save=save,
                    seed=seed
                )

        # Perform PPO update!
        update()

        # # # Log into tensorboard
        log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True)
        log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True)
        log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="ClipFrac", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="StopIter", with_min_and_max=False)
        tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch)
        tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch)
        tb_logger.log_scalar(tag="epoch_temp", value=current_temp, step=epoch)
        if env_version >= 4:
            log_key_to_tb(tb_logger, logger, epoch, key="EpDummyCount", with_min_and_max=False)
            log_key_to_tb(tb_logger, logger, epoch, key="EpTotalArcs", with_min_and_max=False)

            if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0:
                log_key_to_tb(tb_logger, logger, epoch, key="EpDummyStepsNormalized", with_min_and_max=False)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('EpochTemp', current_temp)
        if env_version >= 4:
            logger.log_tabular('EpDummyCount', with_min_and_max=True)
            if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0:
                logger.log_tabular('EpDummyStepsNormalized', average_only=True)
            logger.log_tabular('EpTotalArcs', average_only=True)

        logger.dump_tabular()

        # check for early stop
        if saved:
            # start to count the episodes elapsed after a "saved" event
            early_stop_count_started = True

            # reset the count to 0
            episode_count_after_saved = 0

        else:
            # check whether we should count this episode, i.e., whether early_stop_count_started == True
            if early_stop_count_started:
                episode_count_after_saved += 1
                if episode_count_after_saved > 60:
                    logger.log('Early Stopped at epoch {}.'.format(epoch), color='cyan')
                    break
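
# Hedged sketch (an assumption about what the repo's buffers do, not their actual code):
# a common GAE-lambda computation for a single trajectory, i.e. the kind of advantages
# and value targets that finish_path(last_val) and buf.get() are expected to provide.
# The real PPOBuffer/GAEBuffer may differ in details (e.g. how returns are computed).
import numpy as np

def _gae_lambda(rews, vals, gamma=0.99, lam=0.97):
    # vals carries one extra element: the bootstrap value appended via finish_path(last_val)
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    rets = adv + vals[:-1]  # value-function targets
    return adv, rets

# Example: a 3-step trajectory with bootstrap value 0 at the end.
_adv_demo, _rets_demo = _gae_lambda(np.array([1.0, 0.0, 1.0]),
                                    np.array([0.5, 0.4, 0.6, 0.0]))
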
def trpo(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         delta=0.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=0.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=0.8,
         lam=0.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo='trpo'):
    """
    Trust Region Policy Optimization 

    (with support for Natural Policy Gradient)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given 
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log 
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as 
                                            | specified by the inputs to 
                                            | ``info_phs``) over the batch of 
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure 
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update. 
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be 
            smallish. Adjusts Hessian-vector product calculation:
            
            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient. 
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform. 
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down. 

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the 
            backtracking line search. Since the line search usually doesn't 
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are 
            almost the same.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph
               ] + core.values_as_sorted_list(info_phs)

    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
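    # strip the batch dimension from each old-policy-info placeholder so the buffer
    # can allocate one entry of that shape per time step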
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO losses
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph, hvp = core.hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy(
        )  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
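        # With search direction x ~= H^{-1} g, alpha = sqrt(2 * delta / (x^T H x)) scales the
        # step so the quadratic approximation of the KL divergence for a full step equals delta.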
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[
                1], agent_outs[2], agent_outs[3:]

            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
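
# Hedged sanity check (not part of the original examples): the conjugate-gradient helper
# defined inside trpo() should approximately solve A x = b for a symmetric positive-definite
# A. Here Ax is a plain matrix-vector product standing in for the Hessian-vector product.
import numpy as np

def _cg_demo(A, b, iters=10, eps=1e-8):
    x = np.zeros_like(b)
    r = b.copy()  # residual; equals b because x starts at 0
    p = r.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = A @ p
        alpha = r_dot_old / (np.dot(p, z) + eps)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

_A = np.array([[4.0, 1.0], [1.0, 3.0]])
_b = np.array([1.0, 2.0])
# _cg_demo(_A, _b) and np.linalg.solve(_A, _b) both give approximately [0.0909, 0.6364]
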
def ppo_pyco(gym_or_pyco,
             env_fn,
             actor_critic=core.mlp_actor_critic,
             ac_kwargs=dict(),
             seed=473,
             steps_per_epoch=200,
             epochs=500,
             gamma=0.99,
             clip_ratio=0.1,
             pi_lr=1e-2,
             vf_lr=5e-3,
             train_pi_iters=80,
             train_v_iters=80,
             lam=0.97,
             max_ep_len=350,
             target_kl=0.01,
             logger_kwargs=dict(),
             save_freq=10,
             tensorboard_path='/home/clement/spinningup/tensorboard'):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
        tensorboard_path: The path where TensorBoard graphs and scalars are saved.


    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    #seed += 10000 * proc_id()
    seed = 2808  # hard-coded seed; overrides the seed argument and the MPI offset above
    tf.set_random_seed(seed)
    np.random.seed(seed)
    dict_continous_gym = [
        'CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid',
        'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis',
        'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing',
        'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber',
        'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro',
        'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero',
        'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull',
        'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame',
        'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid',
        'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris',
        'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham',
        'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge',
        'Zaxxon', 'Numberlink'
    ]
    dict_discrete_gym = []

    env = env_fn()
    dict_gym = [
        'CarRacing', 'LunarLander', 'Pong', 'AirRaid', 'Adventure', 'AirRaid',
        'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 'Atlantis',
        'BankHeist', 'BattleZone', 'BeamRider', 'Berzerk', 'Bowling', 'Boxing',
        'Breakout', 'Carnival', 'Centipede', 'ChopperCommand', 'CrazyClimber',
        'Defender', 'Demon_attack', 'DoubleDunk', 'ElevatorAction', 'Enduro',
        'FishingDerby', 'Freeway', 'Frostbite', 'Gopher', 'Gravitar', 'Hero',
        'IceHockey', 'Jamesbond', 'JourneyEscape', 'Kangaroo', 'Krull',
        'KungFuMaster', 'MpntezumaRevenge', 'MsPacman', 'NameThisGame',
        'Phoenix', 'Pitfall', 'Pooyan', 'PrivateEye', 'Qbert', 'Riverraid',
        'RoadRunner', 'Robotank', 'Seaquest', 'Skiing', 'Solaris',
        'SpaceInvaders', 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham',
        'UpNDown', 'Venture', 'VideoPinball', 'WizardOfWor', 'VarsRevenge',
        'Zaxxon', 'Numberlink'
    ]
    # This code is specific to pycolab: env_fn returns a constructor there,
    # so the environment still has to be instantiated.
    if gym_or_pyco != 'gym':
        env = env()

    obs_dim = env.observation_space.shape
    # Discrete action spaces expose .n, Box action spaces expose .shape; the == 4 check
    # handles an environment that reports its action space as a plain int.
    if env.action_space == 4:
        act_dim = env.action_space
    else:
        try:
            act_dim = env.action_space.n
        except AttributeError:
            act_dim = env.action_space.shape

    #act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    if gym_or_pyco == 'pyco':
        x_ph = tf.placeholder(tf.float32,
                              shape=(None, obs_dim[0], obs_dim[1], 1))
    else:
        x_ph = tf.placeholder(tf.float32,
                              shape=(None, obs_dim[0], obs_dim[1], obs_dim[2]))
    # a_ph = core.placeholders_from_spaces(env.action_space)
    if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete):
        a_ph = tf.placeholder(tf.uint8, shape=(None))

    elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box):
        a_ph = tf.placeholder(tf.float32, shape=(env.action_space.shape[0]))

    else:
        a_ph = tf.placeholder(tf.uint8, shape=(None))

    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    # pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, **ac_kwargs)
    # actor_critic with relational policy
    # pi, logp, logp_pi, v, logits = actor_critic(x_ph, a_ph, policy='relational_categorical_policy', action_space = env.action_space.n,  **ac_kwargs)
    if gym_or_pyco == 'gym' and isinstance(env.action_space, Discrete):
        pi, logp, logp_pi, v, logits = actor_critic(
            x_ph,
            a_ph,
            policy='baseline_categorical_policy',
            action_space=env.action_space.n)
    elif gym_or_pyco == 'gym' and isinstance(env.action_space, Box):
        pi, logp, logp_pi, v = actor_critic(
            x_ph,
            a_ph,
            policy='baseline_gaussian_policy',
            action_space=env.action_space.shape[0])
    else:
        pi, logp, logp_pi, v, logits = actor_critic(
            x_ph,
            a_ph,
            policy='baseline_categorical_policy',
            action_space=env.action_space.n)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(gym_or_pyco, obs_dim, act_dim, local_steps_per_epoch,
                    gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # tensorboard test
    with tf.name_scope('pi_loss'):
        core.variable_summaries(pi_loss)

    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # tensorboard
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(tensorboard_path + '/train',
                                         sess.graph)
    test_writer = tf.summary.FileWriter(tensorboard_path + '/test')

    sess.run(tf.global_variables_initializer())
    # saver.restore(sess, "/home/clement/Documents/spinningup_instadeep/trained_params/model.ckpt")
    #saver.restore(sess, "/home/clement/Documents/spinningup_instadeep/data/cmd_ppo_pyco/cmd_ppo_pyco_s0/simple_save")

    #tf.reset_default_graph()
    #export_dir = "/home/clement/Documents/spinningup_instadeep/data/cmd_ppo_pyco/cmd_ppo_pyco_s0/simple_save"
    #tf.saved_model.loader.load(sess, ["serve"],export_dir)
    #sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    #logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update(epoch):
        # inputs = {k:v for k,v in zip(all_phs, buf.get())}
        # the feed_dict is identical for every action-space type, so build it once
        feed_dict = {
            logp_old_ph: buf.logp_buf,
            x_ph: buf.obs_buf,
            a_ph: buf.act_buf,
            adv_ph: buf.adv_buf,
            ret_ph: buf.ret_buf
        }
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=feed_dict)

        # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        summary = tf.Summary(
            value=[tf.Summary.Value(tag="loss", simple_value=pi_l_old)])
        test_writer.add_summary(summary, epoch)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=feed_dict)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break

        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=feed_dict)
        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=feed_dict)

        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    if gym_or_pyco == 'gym':
        o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2])
    else:
        o = rgb_input_pyco(o, obs_dim)
        o = o.reshape(1, obs_dim[0], obs_dim[1], 1)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        num_ep = 0
        summary_ep = []
        logits_debug = []
        for t in range(local_steps_per_epoch):

            # a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.board.reshape(1, -1)})
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o})
            logits_debug.append(sess.run(logits, feed_dict={x_ph: o})[0])

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            # buf.store(o.board.reshape(1,-1), a, r, v_t, logp_t)

            # obs, act, rew, val, logp
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            if gym_or_pyco == 'pyco':
                o = rgb_input_pyco(o, obs_dim)
                o = o.reshape(1, obs_dim[0], obs_dim[1], 1)
            else:
                o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2])

            if r is None:
                r = 0  # the environment can return None as a reward; count it as 0
            ep_ret += r

            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                num_ep += 1
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if the trajectory didn't reach a terminal state, bootstrap the value
                # target from the critic; otherwise the last value is 0
                last_val = 0 if terminal else sess.run(v, feed_dict={x_ph: o})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    summary_ep = summary_ep + [ep_ret]
                    # summary = tf.Summary(value=[tf.Summary.Value(tag="mean_ep_ret", simple_value=summary_ep)])
                    # test_writer.add_summary(summary, num_ep)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                if gym_or_pyco == 'gym':
                    o = o.reshape(1, obs_dim[0], obs_dim[1], obs_dim[2])
                else:
                    o = rgb_input_pyco(o, obs_dim)
                    o = o.reshape(1, obs_dim[0], obs_dim[1], 1)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Summarize this epoch's returns for TensorBoard, then perform the PPO update
        min_summary_ep = min(summary_ep)
        max_summary_ep = max(summary_ep)
        summary_ep = np.mean(summary_ep)
        value_summary = np.mean(buf.val_buf)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="mean_ep_ret", simple_value=summary_ep)
        ])
        test_writer.add_summary(summary, epoch)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="min_ep_ret", simple_value=min_summary_ep)
        ])
        test_writer.add_summary(summary, epoch)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="max_ep_ret", simple_value=max_summary_ep)
        ])
        test_writer.add_summary(summary, epoch)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="mean_value", simple_value=value_summary)
        ])
        test_writer.add_summary(summary, epoch)

        update(epoch)

        #saver = tf.train.Saver()
        #save_path = saver.save(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt")

        # If you want to reload saved variables :
        # with tf.Session() as sess:
        # Restore variables from disk.
        # saver.restore(sess, "/home/clement/Documents/spinningup/trained_params/model.ckpt")

        # since update() reads the buffer arrays directly instead of calling buf.get(), reset the buffer manually here:

        buf.get()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
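# --- Illustration (added, not part of the example above): the epoch loop writes
# TensorBoard scalars by constructing tf.Summary protos by hand. A minimal,
# standalone sketch of that pattern with the TF1 API; the log directory and the
# return values are made up.
import tensorflow as tf

_writer = tf.summary.FileWriter('./tb_logs_demo')        # hypothetical directory
for _epoch, _mean_ret in enumerate([1.0, 2.5, 3.0]):     # dummy per-epoch returns
    _summary = tf.Summary(value=[
        tf.Summary.Value(tag="mean_ep_ret", simple_value=_mean_ret)
    ])
    _writer.add_summary(_summary, _epoch)
_writer.flush()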
def vpg(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-2,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
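    # finish_path() in the buffer turns the stored rewards and value estimates
    # into GAE-lambda advantages:
    #     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #     A_t     = sum_{l >= 0} (gamma * lam)^l * delta_{t+l}
    # with the bootstrap value last_val standing in for V at an epoch cutoff.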

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)
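    # Illustrative check of pi_loss with made-up numbers: if logp = [-0.7, -1.2]
    # and adv = [+1.0, -0.5], then logp * adv = [-0.7, +0.6] and
    # pi_loss = -mean([-0.7, +0.6]) = 0.05.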

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        #logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', average_only=True)
        #logger.log_tabular('EpLen', average_only=True)
        #logger.log_tabular('VVals', with_min_and_max=True)
        #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        #logger.log_tabular('LossPi', average_only=True)
        #logger.log_tabular('LossV', average_only=True)
        #logger.log_tabular('DeltaLossPi', average_only=True)
        #logger.log_tabular('DeltaLossV', average_only=True)
        #logger.log_tabular('Entropy', average_only=True)
        #logger.log_tabular('KL', average_only=True)
        #logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
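# --- Hedged usage sketch (added, not part of the example above): calling vpg()
# with an env_fn that satisfies the Gym API, as the docstring requires. Assumes
# the gym package is available, that core.mlp_actor_critic accepts hidden_sizes,
# and that EpochLogger accepts output_dir/exp_name; those kwargs and the
# directory name are illustrative, not taken from the original.
import gym

vpg(lambda: gym.make('CartPole-v1'),
    ac_kwargs=dict(hidden_sizes=(64, 64)),
    steps_per_epoch=4000,
    epochs=10,
    logger_kwargs=dict(output_dir='./vpg_cartpole_demo', exp_name='vpg_demo'))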
Example #22
def ppo(env_fn, actor_critic=core_2.mlp_actor_critic, beta=1, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()  # game environment
    obs_dim = env.observation_space.shape  # get the observation dimension from the environment
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #print(env.action_space)
    x_ph, a_ph = core_2.placeholders_from_spaces(env.observation_space, env.action_space)  # a_ph has no data yet when the graph is built
    adv_ph, ret_ph, logp_old_ph, log_old_ph_all = core_2.placeholders(None, None, None, 18)
    #print(logp_old_ph)
    #print(log_old_ph_all)
    # Main outputs from computation graph
    pi, logp, logp_pi, v, logp_all = actor_critic(x_ph, a_ph, **ac_kwargs)  # the state and action here are still just placeholders

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, log_old_ph_all]

    # Every step, get: action, value, and logprob (pi is the sampled action here)
    get_action_ops = [pi, v, logp_pi, logp_all]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core_2.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    #print((tf.exp(log_old_ph_all) * (logp - logp_old_ph)))
    kl = tf.reduce_mean(tf.multiply(tf.exp(log_old_ph_all),tf.transpose([logp - logp_old_ph])))
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    #pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))  # the two-part clipped loss (unused in this variant)
    # Feed beta through a placeholder so the adaptive adjustment below actually changes the objective
    beta_ph = tf.placeholder(tf.float32, shape=())
    pi_loss = -tf.reduce_mean(ratio * adv_ph - beta_ph * kl)

    v_loss = tf.reduce_mean((ret_ph - v) ** 2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))
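    # approx_kl is the simple sample estimate mean(logp_old - logp); clipfrac is
    # the fraction of samples whose ratio pi/pi_old left the
    # [1 - clip_ratio, 1 + clip_ratio] interval. Both are diagnostics only here,
    # since this variant optimizes the KL-penalty objective rather than the
    # clipped surrogate.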

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, logp_all = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            # push the data into the buffer
            buf.store(o, a, r, v_t, logp_t, logp_all)
            logger.store(VVals=v_t)
            # o is the observation returned by the environment
            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # after collecting a full epoch of experience, perform one update
        #update()

        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        inputs[beta_ph] = beta  # current KL-penalty coefficient
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kld = sess.run([train_pi, kl], feed_dict=inputs)
            kld = mpi_avg(kld)
            if kld > 1.5 * target_kl:
                beta = 2 * beta
            if kld < target_kl / 1.5:
                beta = beta / 2
                # logger.log('Early stopping at step %d due to reaching max kl.' % i)
                # break
            inputs[beta_ph] = beta  # apply the adjusted coefficient on the next gradient step
        logger.store(StopIter=i)
        # the loop above trains the policy; this one trains the value function
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl_val, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl_val, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
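# --- Sketch (added, not part of the example above) of the adaptive KL-penalty
# rule used in the training loop: the coefficient doubles when the measured KL
# exceeds 1.5x the target and halves when it drops below target/1.5. The helper
# name adapt_beta is made up for illustration.
def adapt_beta(beta, measured_kl, target_kl):
    if measured_kl > 1.5 * target_kl:
        return beta * 2.0
    if measured_kl < target_kl / 1.5:
        return beta / 2.0
    return beta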
Example #23
def sppo(args,
         env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=200,
         target_kl=0.01,
         logger_kwargs=dict(),
         save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer (read from ``args.pi_lr``
            rather than passed directly to this function).

        vf_lr (float): Learning rate for value function optimizer (read from
            ``args.vf_lr``).

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    ###########
    if args.alpha == 'auto':
        target_entropy = 0.35

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=tf.log(0.2))
        alpha = tf.exp(log_alpha)
    else:
        alpha = args.alpha
    ###########

    # Main outputs from computation graph
    mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph,
                                                  **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, h]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    ######

    if args.alpha == 'auto':
        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(-h + target_entropy)
        )  # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 )

        alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5)
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])

    ######

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)

    # For PPO
    # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # ### Scheme1: SPPO NO.2: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp)
    # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    # ### Scheme3: SPPO NO.3: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph
    # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    ### Scheme2: SPPO NO.2: add entropy
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(
        tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h)

    v_loss = tf.reduce_mean((ret_ph - v)**2)  #+(ret_ph - q)**2)/2.0

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        h)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(
        learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss)
    # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            if args.alpha == 'auto':
                sess.run(train_alpha_op, feed_dict=inputs)
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        # for _ in range(train_v_iters):
        #     sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old),
                     Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, h_t = sess.run(get_action_ops,
                                           feed_dict={x_ph: o.reshape(1, -1)})
            # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a})
            # SPPO NO.1: add entropy
            # rh = r - args.alpha * logp_t
            if args.alpha == 'auto':
                rh = r + sess.run(alpha) * h_t
            else:
                rh = r + alpha * h_t  # exact entropy

            # save and log
            buf.store(o, a, rh, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r

            ep_len += 1
            # d = False if ep_len == max_ep_len else d

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # # Save model
        # if (epoch % save_freq == 0) or (epoch == epochs-1):
        #     logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Alpha', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
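# --- Standalone NumPy illustration (added, not part of the example above) of the
# automatic temperature objective used when args.alpha == 'auto': the loss pushes
# log_alpha up when the policy entropy h falls below target_entropy and down
# otherwise; h is treated as a constant, mirroring tf.stop_gradient. The helper
# name alpha_loss_value is made up.
import numpy as np

def alpha_loss_value(log_alpha, h, target_entropy=0.35):
    # same quantity as tf.reduce_mean(-log_alpha * tf.stop_gradient(-h + target_entropy))
    return float(np.mean(-log_alpha * (-np.asarray(h) + target_entropy)))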
    def __init__(self, args={}):
        self.bot = None
        if "bot" in args:
            bot = args["bot"]

        self.epoch = 0
        self.step = 0

        self.actor_critic = core.mlp_actor_critic
        self.ac_kwargs = dict(hidden_sizes=[64] * 2)
        self.seed = 0
        self.steps_per_epoch = 10000
        self.epochs = 10
        self.gamma = 0.99
        self.clip_ratio = 0.2
        self.pi_lr = 3e-4
        self.vf_lr = 1e-3
        self.train_pi_iters = 80
        self.train_v_iters = 80
        self.lam = 0.97
        self.max_ep_len = 1000
        self.target_kl = 0.01
        self.logger_kwargs = {}
        self.save_freq = 1

        map_name = "unknown"
        if bot is not None:
            map_name = bot.map_name
        self.logger_kwargs = {
            "output_dir": f".\\{map_name}\\ai_data",
            "exp_name": "builder_ai"
        }

        self.logger = EpochLogger(**self.logger_kwargs)

        #self.logger.save_config(locals())
        self.logger.save_config(self.__dict__)

        seed = self.seed
        seed += 10000 * proc_id()
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #env = env_fn()
        self.env = BuilderEnv(args={"bot": self.bot})
        obs_dim = self.env.observation_space.shape
        act_dim = self.env.action_space.shape

        # Share information about action space with policy architecture
        self.ac_kwargs['action_space'] = self.env.action_space

        print(str(self.env.observation_space))
        print(str(self.env.action_space))

        print(str(type(self.env.observation_space)))
        print(str(type(self.env.action_space)))

        # Inputs to computation graph
        self.x_ph, self.a_ph = core.placeholders_from_spaces(
            self.env.observation_space, self.env.action_space)
        self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders(
            None, None, None)

        # Main outputs from computation graph
        self.pi, self.logp, self.logp_pi, self.v = self.actor_critic(
            self.x_ph, self.a_ph, **self.ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        self.all_phs = [
            self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph
        ]

        # Every step, get: action, value, and logprob
        self.get_action_ops = [self.pi, self.v, self.logp_pi]

        # Experience buffer
        self.local_steps_per_epoch = int(self.steps_per_epoch / num_procs())
        self.buf = ppo.PPOBuffer(
            obs_dim, act_dim, self.local_steps_per_epoch, self.gamma, self.lam)

        # Count variables
        var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                        var_counts)

        # PPO objectives
        self.ratio = tf.exp(self.logp -
                            self.logp_old_ph)  # pi(a|s) / pi_old(a|s)
        self.min_adv = tf.where(self.adv_ph > 0,
                                (1 + self.clip_ratio) * self.adv_ph,
                                (1 - self.clip_ratio) * self.adv_ph)
        self.pi_loss = -tf.reduce_mean(
            tf.minimum(self.ratio * self.adv_ph, self.min_adv))
        self.v_loss = tf.reduce_mean((self.ret_ph - self.v)**2)

        # Info (useful to watch during learning)
        self.approx_kl = tf.reduce_mean(
            self.logp_old_ph -
            self.logp)  # a sample estimate for KL-divergence, easy to compute
        self.approx_ent = tf.reduce_mean(
            -self.logp)  # a sample estimate for entropy, also easy to compute
        self.clipped = tf.logical_or(self.ratio > (1 + self.clip_ratio),
                                     self.ratio < (1 - self.clip_ratio))
        self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32))

        print(f"pi_lr:{self.pi_lr}, pi_loss:{self.pi_loss}")

        # Optimizers
        self.train_pi = MpiAdamOptimizer(learning_rate=self.pi_lr).minimize(
            self.pi_loss)
        self.train_v = MpiAdamOptimizer(learning_rate=self.vf_lr).minimize(
            self.v_loss)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        # Sync params across processes
        self.sess.run(sync_all_params())

        # Setup model saving
        self.logger.setup_tf_saver(self.sess,
                                   inputs={'x': self.x_ph},
                                   outputs={
                                       'pi': self.pi,
                                       'v': self.v
                                   })

        self.start_time = time.time()
        self.o, self.r, self.d, self.ep_ret, self.ep_len = self.env.reset(
            args={}), 0, False, 0, 0

        print(f"o:{self.o}, type:{type(self.o)}")

        self.epoch = 0
        self.t = 0

        self.load()