Example #1
def ppo(BASE_DIR,
        expert_density,
        env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        steps_per_epoch=1000,
        epochs=10,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=50,
        train_v_iters=50,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        data_n=10):

    data = {}  # ALL THE DATA

    # NOTE: this function relies on a module-level `args` namespace (dir_name, trajects, rounds, eps, ...)
    logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # update rule
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(
        -2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                       args.iter_length)
    density = policy_distr.density()

    data[0] = {
        'pol_s': policy_distr.num_samples,
        'pol_t': policy_distr.num_trajects
    }

    dist_rewards = []

    # repeat REIL for given number of rounds
    for i in range(args.rounds):

        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        reward = lambda s: expert_density(s) / (density(s) + args.eps)

        dist_rewards.append(reward)

        start_time = time.time()
        o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        r = reward(o)  # custom reward

        # Main loop: collect experience in env and update/log each epoch
        for epoch in range(epochs):
            for t in range(local_steps_per_epoch):

                a, v_t, logp_t = sess.run(get_action_ops,
                                          feed_dict={x_ph: o.reshape(1, -1)})

                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                o, old_r, d, _ = env.step(a[0])
                r = reward(o)
                ep_ret += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (t == local_steps_per_epoch - 1):
                    if not (terminal):
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = old_r if d else sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})
                    # NOTE: the next line overwrites the bootstrap above with the custom
                    # density-ratio reward at the final observation
                    last_val = reward(o)
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    r = reward(o)

            # store model!
            if (epoch == epochs - 1): logger.save_state({'env': env}, None)

            # Perform PPO update!
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts',
                               (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            print(message)

        policy = lambda state: sess.run(
            get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0]
        data[i] = {
            'pol_s': policy_distr.num_samples,
            'pol_t': policy_distr.num_trajects
        }
        data[i]['rewards'] = evaluate_reward(env, policy, data_n)

        if i != args.rounds - 1:
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    return data, dist_rewards
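A minimal, self-contained sketch of the density-ratio reward that Example #1 rebuilds each round (reward(s) = expert_density(s) / (density(s) + eps)). The Gaussian densities and names below are illustrative stand-ins, not the Gaussian_Density objects used in the original code.

import numpy as np

def gaussian_pdf(s, mean, var):
    # isotropic Gaussian density with the same variance in every dimension
    s = np.asarray(s, dtype=float)
    d = s.size
    return np.exp(-np.sum((s - mean) ** 2) / (2.0 * var)) / np.sqrt((2.0 * np.pi * var) ** d)

expert_density = lambda s: gaussian_pdf(s, mean=0.0, var=0.5)   # stand-in expert state density
policy_density = lambda s: gaussian_pdf(s, mean=1.0, var=1.0)   # stand-in current-policy state density
eps = 1e-6  # small constant to avoid division by zero

def reward(s):
    # States the expert visits more often than the current policy score highly.
    return expert_density(s) / (policy_density(s) + eps)

print(reward(np.array([0.0, 0.0])))   # near the expert mode -> large reward
print(reward(np.array([2.0, 2.0])))   # far from the expert mode -> small reward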
Example #2
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    maxRev = float("-inf")  # start at negative infinity so the first completed episode becomes the incumbent
    maxRevActionSeq = []  # initialize so exportTollProfile below cannot fail if no episode ever finishes
    maxRevTSTT = 0
    maxRevRevenue = 0
    maxRevThroughput = 0
    maxRevJAH = 0
    maxRevRemVeh = 0
    maxRevJAH2 = 0
    maxRevRMSE_MLvio = 0
    maxRevPerTimeVio = 0
    maxRevHOTDensity = pd.DataFrame()
    maxRevGPDensity = pd.DataFrame()
    maxtdJAHMax = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # scale the sampled actions from (-1, 1) to the toll range, since they come from a tanh-activated mean
            numpyFromA = np.array(a[0])
            numpyFromA = ((numpyFromA + 1.0) *
                          (env.state.tollMax - env.state.tollMin) /
                          2.0) + env.state.tollMin
            a[0] = np.ndarray.tolist(numpyFromA)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    #get other stats and store them too
                    otherStats = env.getAllOtherStats()
                    if np.any(np.isnan(np.array(otherStats))):
                        sys.exit("Nan found in statistics! Error")
                    logger.store(EpTSTT=otherStats[0],
                                 EpRevenue=otherStats[1],
                                 EpThroughput=otherStats[2],
                                 EpJAH=otherStats[3],
                                 EpRemVeh=otherStats[4],
                                 EpJAH2=otherStats[5],
                                 EpMLViolRMSE=otherStats[6],
                                 EpPerTimeVio=otherStats[7],
                                 EptdJAHMax=otherStats[8])
                    #determine max rev profile
                    if ep_ret > maxRev:
                        maxRev = ep_ret
                        maxRevActionSeq = env.state.tollProfile
                        maxRevTSTT = otherStats[0]
                        maxRevRevenue = otherStats[1]
                        maxRevThroughput = otherStats[2]
                        maxRevJAH = otherStats[3]
                        maxRevRemVeh = otherStats[4]
                        maxRevJAH2 = otherStats[5]
                        maxRevRMSE_MLvio = otherStats[6]
                        maxRevPerTimeVio = otherStats[7]
                        maxRevHOTDensity = env.getHOTDensityData()
                        maxRevGPDensity = env.getGPDensityData()
                        maxtdJAHMax = otherStats[8]
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpTSTT', average_only=True)
        logger.log_tabular('EpRevenue', average_only=True)
        logger.log_tabular('EpThroughput', average_only=True)
        logger.log_tabular('EpJAH', average_only=True)
        logger.log_tabular('EpRemVeh', average_only=True)
        logger.log_tabular('EpJAH2', average_only=True)
        logger.log_tabular('EpMLViolRMSE', average_only=True)
        logger.log_tabular('EpPerTimeVio', average_only=True)
        logger.log_tabular('EptdJAHMax', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    print("Max cumulative reward obtained= %f " % maxRev)
    print(
        "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f"
        %
        (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh,
         maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax))
    outputVector = [
        maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH,
        maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio,
        maxtdJAHMax
    ]
    #print("\n===Max rev action sequence is\n",maxRevActionSeq)
    exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector)
    exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
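Example #2 rescales actions sampled from a tanh-squashed policy, which live in (-1, 1), onto the toll range before stepping the environment. Below is a standalone sketch of that affine mapping; toll_min and toll_max are hypothetical stand-ins for env.state.tollMin and env.state.tollMax.

import numpy as np

def rescale_action(a, toll_min, toll_max):
    # map each component from (-1, 1) onto [toll_min, toll_max]
    a = np.asarray(a, dtype=float)
    return (a + 1.0) * (toll_max - toll_min) / 2.0 + toll_min

print(rescale_action([-1.0, 0.0, 1.0], toll_min=0.5, toll_max=8.0))
# -> [0.5  4.25 8.  ]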
Example #3
def ppo(env_fn,
        ref_func=None,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=500,
        epochs=10000,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=500,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    t_a_ph = core.placeholder_from_space(env.action_space)
    ret_ph = core.placeholder(None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, t_a_ph, ret_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    print("---------------", local_steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # DAgger-style objectives: regress the policy's action toward the reference (MPC) action
    pi_loss = tf.reduce_mean(tf.square(pi - t_a_ph))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old = sess.run([pi_loss, v_loss], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            sess.run(train_pi, feed_dict=inputs)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new = sess.run([pi_loss, v_loss], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(1, epochs + 1, 1):
        for t in range(local_steps_per_epoch):
            a_s, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            a = a_s[0]
            ref_a = call_mpc(env, ref_func)
            if (epoch < 100):
                a = ref_a

            # save and log
            buf.store(o, a, ref_a, r)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({}, None)

        # Perform the (DAgger-style, supervised) update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
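Example #3 keeps the PPO scaffolding but swaps the clipped surrogate for a DAgger-style imitation objective: the policy is regressed onto a reference (MPC) action, and the reference controller drives the environment for roughly the first 100 epochs. Below is a minimal NumPy sketch of that loss and warm-start switch, with hypothetical names.

import numpy as np

def imitation_loss(pi_actions, ref_actions):
    # mean squared error between policy actions and reference (expert) actions
    pi_actions, ref_actions = np.asarray(pi_actions, float), np.asarray(ref_actions, float)
    return np.mean(np.square(pi_actions - ref_actions))

def choose_action(pi_action, ref_action, epoch, warmup_epochs=100):
    # During warm-up the reference controller drives the environment,
    # mirroring the `if (epoch < 100): a = ref_a` check above.
    return ref_action if epoch < warmup_epochs else pi_action

print(imitation_loss([[0.2, -0.1]], [[0.0, 0.0]]))  # 0.025
print(choose_action([0.2], [0.5], epoch=10))        # [0.5] (still in warm-up)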
Example #4
def sigail(env_fn,
           traj_dir,
           actor_critic=core.mlp_actor_critic_add,
           ac_kwargs=dict(),
           d_hidden_size=64,
           seed=0,
           steps_per_epoch=4000,
           epochs=50,
           gamma=0.99,
           clip_ratio=0.2,
           pi_lr=3e-4,
           vf_lr=1e-3,
           train_pi_iters=40,
           train_v_iters=40,
           lam=0.97,
           max_ep_len=4000,
           beta=1e-4,
           target_kl=0.01,
           logger_kwargs=dict(),
           save_freq=100,
           r_env_ratio=0,
           d_itr=20,
           reward_type='negative',
           trj_num=20,
           buf_size=1000,
           si_update_ratio=0.02,
           js_smooth=5,
           buf_update_type='random',
           pretrain_bc_itr=0):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env, hidden_size=d_hidden_size,
                      reward_type=reward_type)  #!add Discriminator object
    D_js_m = JS_div_machine(env, hidden_size=d_hidden_size)

    e_obs = np.zeros((buf_size, obs_dim[0]))
    e_act = np.zeros((buf_size, act_dim[0]))
    Sibuffer = SIBuffer(obs_dim,
                        act_dim,
                        e_obs,
                        e_act,
                        trj_num=trj_num,
                        max_size=buf_size,
                        js_smooth_num=js_smooth)  #!sibuf
    trj_full = False
    assert e_obs.shape[1:] == obs_dim
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, pi_std, entropy, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(
        ratio * adv_ph, min_adv)) - beta * entropy  # add entropy bonus
    v_loss = tf.reduce_mean((ret_ph - v)**2)  # ret_ph holds the buffer of returns (rewards-to-go)
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()

    BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v
                  for k, v in zip(all_phs, buf.get())
                  }  # all_phs lists the placeholders matching each array returned by the buffer
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training (does this also need changing? probably not)
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  # if the KL after an update exceeds 1.5x the target, log it and stop the policy-training loop
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):  # value function update
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update (compute the new losses)
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)

        std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
        logger.store(
            LossPi=pi_l_old,
            LossV=v_l_old,
            KL=kl,
            Entropy=std_ent,
            ClipFrac=cf,
            DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
            DeltaLossV=(v_l_new - v_l_old),
            Std=std)

    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0

    if pretrain_bc_itr > 0:
        BC.learn(Sibuffer.expert_obs,
                 Sibuffer.expert_act,
                 max_itr=pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            '''
            if t <150:
                env.render()
                time.sleep(0.03)
            '''

            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                '''
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                '''

                #!add discriminator train
                '''# optionally, the terminal state-action pair could be included as well
                o_reshape = o.reshape(core.combined_shape(1,obs_dim))
                a_reshape = a.reshape(core.combined_shape(1,act_dim))
                agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#! convert o from (obs_dim,) to (1, obs_dim) before appending
                agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)# also train D on the terminal state-action pair
                '''
                agent_obs = buf.obs_buf[buf.path_slice()]
                agent_act = buf.act_buf[buf.path_slice()]

                #D.train(sess,e_obs,e_act ,agent_obs,agent_act)

                #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()  # store the rewards for the state-action pairs in the buffer (rewards are shifted by one step)

                if trj_full:
                    gail_r = 1
                else:
                    gail_r = 0
                rew_gail = gail_r * D.get_reward(
                    sess, agent_obs,
                    agent_act).ravel()  # discriminator rewards for the state-action pairs (shifted by one step)

                ep_ret_gail += rew_gail.sum()  #!before gail_ratio
                ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail

                rew_gail_head = rew_gail[:-1]
                last_val_gail = rew_gail[-1]

                buf.rew_buf[slice(
                    buf.path_start_idx + 1,
                    buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[
                        slice(buf.path_start_idx + 1,
                              buf.ptr)]  #! add GAIL reward (the final reward is excluded, so the slice is one element shorter)

                if d:  # episode ended in a terminal state: use the final blended reward
                    last_val = r_env_ratio * r + last_val_gail
                else:  # otherwise bootstrap the value target with the critic's estimate
                    last_val = sess.run(v,
                                        feed_dict={x_ph: o.reshape(1, -1)
                                                   })  # was written as v_last=... before, but this is fine

                buf.finish_path(
                    last_val)  # make sure buf.finish_add_r_v has been called before this
                if terminal:
                    # only store the trajectory in the SIBuffer if it finished
                    Sibuffer.store(
                        agent_obs, agent_act,
                        sum_reward=ep_ret_task)  #!store trajectory (both branches of the original trj_full check were identical)
                    logger.store(EpRet=ep_ret_task,
                                 EpRet_Sum=ep_ret_sum,
                                 EpRet_Gail=ep_ret_gail,
                                 EpLen=ep_len)

                o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(
                ), 0, False, 0, 0, 0, 0

        # Save model

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        if not (trj_full):
            M_obs_buf = Sibuffer.get_obs_trj()
        trj_full = (M_obs_buf.shape[0] >= buf_size)

        if trj_full:  # once the replay buffer holds enough samples (at least buf_size)
            Sibuffer.update_main_buf(ratio_update=si_update_ratio,
                                     update_type=buf_update_type)
            M_obs_buf = Sibuffer.get_obs_trj()
            M_act_buf = Sibuffer.get_act_trj()

            d_batch_size = len(agent_obs)
            for _t in range(d_itr):
                e_obs_batch, e_act_batch = Sibuffer.get_random_batch(
                    d_batch_size)

                D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act)

                D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs,
                             e_act)  # train to track the divergence between the buffer and the expert data
            js_d = D.get_js_div(sess, Sibuffer.main_obs_buf,
                                Sibuffer.main_act_buf, agent_obs, agent_act)
            js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs,
                                       e_act)

        else:
            js_d, js_d_m = 0.5, 0.5
        update()

        Sibuffer.store_js(js_d)
        logger.store(JS=js_d,
                     JS_M=js_d_m,
                     JS_Ratio=Sibuffer.js_ratio_with_random)

        # Log info about epoch
        #if epoch%10 == 0:#logger print each 10 epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('buffer_r', Sibuffer.buffer_r_average)
        logger.log_tabular('JS', average_only=True)
        logger.log_tabular('JS_M', average_only=True)
        logger.log_tabular('JS_Ratio', average_only=True)
        logger.dump_tabular()
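Example #4 rewrites the rewards stored in the PPO buffer: once the trajectory buffer is full, a discriminator ("GAIL") reward is switched on and combined with the environment reward scaled by r_env_ratio. Below is a small NumPy sketch of that blending, with hypothetical names.

import numpy as np

def blend_rewards(env_rews, gail_rews, r_env_ratio, trj_full):
    # combine per-step environment rewards with discriminator rewards along one trajectory
    env_rews = np.asarray(env_rews, dtype=float)
    gail_rews = np.asarray(gail_rews, dtype=float)
    gail_gate = 1.0 if trj_full else 0.0  # mirrors gail_r in the example
    return gail_gate * gail_rews + r_env_ratio * env_rews

print(blend_rewards([1.0, 1.0, 1.0], [0.2, 0.4, 0.6], r_env_ratio=0.0, trj_full=True))
# -> [0.2 0.4 0.6]  (pure imitation reward once the buffer is full)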
Example #5
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10, custom_h=None, eval_episodes=50,
        do_checkpoint_eval=False, env_name=None, eval_temp=1.0, train_starting_temp=1.0,
        env_version=None, env_input=None, target_arcs=None):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # create logger for tensorboard
    tb_logdir = "{}/tb_logs/".format(logger.output_dir)
    tb_logger = Logger(log_dir=tb_logdir)

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    if custom_h is not None:
        hidden_layers_str_list = custom_h.split('-')
        hidden_layers_int_list = [int(h) for h in hidden_layers_str_list]
        ac_kwargs['hidden_sizes'] = hidden_layers_int_list

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    temperature_ph = tf.placeholder(tf.float32, shape=(), name="init")

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, temperature_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, temperature_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v) ** 2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the
    # whole GPU memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # log tf graph
    tf.summary.FileWriter(tb_logdir, sess.graph)

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'temperature': temperature_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, []

    # initialize variables for keeping track of BEST eval performance
    best_eval_AverageEpRet = -0.05  # a negative value so that best model is saved at least once.
    best_eval_StdEpRet = 1.0e30

    # save is used to only allow saving BEST models after half of training epochs
    save = True

    # below are used for early stopping. We early-stop if
    # 1) a best model has been saved, and
    # 2) 50 epochs have passed without a new save
    saved = False
    early_stop_count_started = False
    episode_count_after_saved = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        current_temp = _get_current_temperature(epoch, epochs, train_starting_temp)
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1),
                                                                 temperature_ph: current_temp})

            # save and log
            buf.store(o, a, r, v_t, logp_t, current_temp)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            if env_version >= 4 and env.action_is_dummy:  # a is dummy action
                ep_dummy_action_count += 1
                ep_dummy_steps_normalized.append(ep_len / env.allowed_steps)

            terminal = d or (ep_len == max_ep_len)

            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1),
                                                              temperature_ph: current_temp})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    if env_version >= 4:
                        logger.store(EpDummyCount=ep_dummy_action_count)
                        logger.store(EpTotalArcs=env.adjacency_matrix.sum())
                        if len(ep_dummy_steps_normalized) > 0:
                            ep_dummy_steps_normalized = np.asarray(ep_dummy_steps_normalized, dtype=np.float32).mean()
                            logger.store(EpDummyStepsNormalized=ep_dummy_steps_normalized)

                o, r, d, ep_ret, ep_len, ep_dummy_action_count, ep_dummy_steps_normalized = env.reset(), 0, False, 0, 0, 0, []

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save.
            # logger.save_state({'env_name': env_name}, epoch)

            # # Save a new model every save_freq and at the last epoch. Only keep one copy - the current model
            # logger.save_state({'env_name': env_name})


            # Evaluate and save best model
            if do_checkpoint_eval and epoch > 0:
                # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999.
                # Doing this way, I can use test_policy and plot directly to test the best models.
                # saved best models includes:
                # 1) a copy of the env_name
                # 2) the best rl model with parameters
                # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch
                # note that 1) and 2) are spinningup defaults, and 3) is a custom save
                best_eval_AverageEpRet, best_eval_StdEpRet, saved = eval_and_save_best_model(
                    best_eval_AverageEpRet,
                    best_eval_StdEpRet,
                    # a new logger is created and passed in so that the new logger can leverage the directory
                    # structure without messing up the logger in the training loop
                    eval_logger=EpochLogger(**dict(
                        exp_name=logger_kwargs['exp_name'],
                        output_dir=os.path.join(logger.output_dir, "simple_save999999"))),

                    train_logger=logger,
                    tb_logger=tb_logger,
                    epoch=epoch,
                    # the env_name is passed in so that an env can be created when and where it is needed. This avoids a
                    # logx.save_state() error where an env pointer cannot be pickled
                    env_name="F{}x{}T{}_SP{}_v{}".format(env.n_plant, env.n_product, env.target_arcs, env.n_sample,
                                                         env_version) if env_version >= 3 else env_name,
                    env_version=env_version,
                    env_input=env_input,
                    render=False,  # change this to True if you want to visualize how arcs are added during evaluation
                    target_arcs=env.target_arcs,
                    get_action=lambda x: sess.run(pi, feed_dict={x_ph: x[None, :],
                                                                 temperature_ph: eval_temp})[0],
                    # number of samples to draw when simulating demand
                    n_sample=5000,
                    num_episodes=eval_episodes,
                    save=save,
                    seed=seed
                )

        # Perform PPO update!
        update()

        # Log to tensorboard
        log_key_to_tb(tb_logger, logger, epoch, key="EpRet", with_min_and_max=True)
        log_key_to_tb(tb_logger, logger, epoch, key="EpLen", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="VVals", with_min_and_max=True)
        log_key_to_tb(tb_logger, logger, epoch, key="LossPi", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="LossV", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossPi", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="DeltaLossV", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="Entropy", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="KL", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="ClipFrac", with_min_and_max=False)
        log_key_to_tb(tb_logger, logger, epoch, key="StopIter", with_min_and_max=False)
        tb_logger.log_scalar(tag="TotalEnvInteracts", value=(epoch + 1) * steps_per_epoch, step=epoch)
        tb_logger.log_scalar(tag="Time", value=time.time() - start_time, step=epoch)
        tb_logger.log_scalar(tag="epoch_temp", value=current_temp, step=epoch)
        if env_version >= 4:
            log_key_to_tb(tb_logger, logger, epoch, key="EpDummyCount", with_min_and_max=False)
            log_key_to_tb(tb_logger, logger, epoch, key="EpTotalArcs", with_min_and_max=False)

            if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0:
                log_key_to_tb(tb_logger, logger, epoch, key="EpDummyStepsNormalized", with_min_and_max=False)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('EpochTemp', current_temp)
        if env_version >= 4:
            logger.log_tabular('EpDummyCount', with_min_and_max=True)
            if len(logger.epoch_dict['EpDummyStepsNormalized']) > 0:
                logger.log_tabular('EpDummyStepsNormalized', average_only=True)
            logger.log_tabular('EpTotalArcs', average_only=True)

        logger.dump_tabular()

        # check for early stop
        if saved:
            # start to count the epochs elapsed after a "saved" event
            early_stop_count_started = True

            # reset the count to 0
            episode_count_after_saved = 0

        else:
            # check whether we should count this epoch, i.e., whether early_stop_count_started == True
            if early_stop_count_started:
                episode_count_after_saved += 1
                if episode_count_after_saved > 60:
                    logger.log('Early Stopped at epoch {}.'.format(epoch), color='cyan')
                    break
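A note on the early stopping above: it reduces to a patience counter that only starts ticking after a best model has been saved. Below is a minimal, standalone sketch of that bookkeeping (the class name is made up for illustration; the 60-epoch patience matches the check in the loop above):

class EarlyStopTracker:
    """Patience counter: stop once `patience` epochs pass without a new best-model save."""

    def __init__(self, patience=60):
        self.patience = patience
        self.started = False  # becomes True after the first best-model save
        self.count = 0        # epochs elapsed since the last save

    def update(self, saved_this_epoch):
        """Call once per epoch; returns True when training should stop."""
        if saved_this_epoch:
            self.started = True
            self.count = 0
            return False
        if self.started:
            self.count += 1
        return self.count > self.patience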
Ejemplo n.º 6
0
def gail(env_fn,traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(),d_hidden_size =64,d_batch_size = 64,seed=0, 
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000,beta =1e-4,
        target_kl=0.01, logger_kwargs=dict(), save_freq=100,
        r_env_ratio=0,gail_ratio =1, d_itr =20, reward_type = 'negative',
        pretrain_bc_itr =0):
    """

    additional args
    d_hidden_size : hidden layer size of Discriminator
    d_batch_size : Discriminator's batch size

    r_env_ratio,gail_ratio : the weight of rewards from envirionment and gail .Total reward = gail_ratio *rew_gail+r_env_ratio* rew_from_environment
    
    d_itr : The number of iteration of update discriminater 
    reward_type : GAIL reward has three type ['negative','positive', 'AIRL']
    trj_num :the number of trajectory for 
    pretrain_bc_itr: the number of iteration of pretraining by behavior cloeing
    
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D=Discriminator(env,hidden_size = d_hidden_size,reward_type =reward_type)
    
    
    e_obs = np.loadtxt(traj_dir + '/observations.csv',delimiter=',')
    e_act = np.loadtxt(traj_dir + '/actions.csv', delimiter=',')  # Demo trajectory

    Sibuffer =SIBuffer(obs_dim, act_dim, e_obs,e_act,trj_num= 0, max_size =None)#!sibuf

    assert e_obs.shape[1:] == obs_dim 
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi,pi_std, entropy, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))- beta*entropy
    v_loss = tf.reduce_mean((ret_ph - v)**2)  # ret_ph holds the buffer of discounted returns
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()


    BC = BehavioralCloning(sess,pi,logp,x_ph,a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())


    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}  # all_phs holds the placeholders corresponding to each buffer
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training  # does this also need changing? Probably not
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  # if the KL after an update exceeds 1.5x the target, log it and break out of the training loop
                logger.log('Early stopping at step %d due to reaching max kl.'%i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):  # update v
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update (compute the new losses)
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        
        std, std_ent = sess.run([pi_std,entropy],feed_dict = inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, 
                     KL=kl, Entropy=std_ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),  # improvement from the update
                     DeltaLossV=(v_l_new - v_l_old),
                     Std = std)

    start_time = time.time()
    o, r, d, ep_ret_task,ep_ret_gail, ep_len = env.reset(), 0, False, 0,0 , 0


    if pretrain_bc_itr>0:
        BC.learn(Sibuffer.expert_obs,Sibuffer.expert_act ,max_itr =pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            buf.store_rew(r)
            '''
            if t <150:
                env.render()
                time.sleep(0.03)
            '''

            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t==local_steps_per_epoch-1):
                # if trajectory didn't reach terminal state, bootstrap value target
                if d:
                    last_val = r
                else:
                    last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})  # was v_last=..., but this seems fine

                buf.store_rew(last_val)  # if it's terminal nothing changes; if it was cut off at the max step, last_val is used
                buf.finish_path()
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret_task, EpLen=ep_len)#,EpRet_Sum =ep_ret_sum,EpRet_Gail =ep_ret_gail)
        
                o, r, d, ep_ret_task,ep_ret_sum,ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0

        # Save model
        
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, epoch)


        agent_obs , agent_act = buf.obs_buf, buf.act_buf

        d_batch_size = d_batch_size  # or len(agent_obs)//d_itr  # update the discriminator
        for _t in range(d_itr):
            e_obs_batch ,e_act_batch =Sibuffer.get_random_batch(d_batch_size)
            a_obs_batch  =sample_batch(agent_obs,batch_size = d_batch_size)
            a_act_batch= sample_batch(agent_act,batch_size = d_batch_size)
            D.train(sess, e_obs_batch,e_act_batch , a_obs_batch,a_act_batch )
        js_d = D.get_js_div(sess,Sibuffer.main_obs_buf,Sibuffer.main_act_buf,agent_obs,agent_act)
        #---------------get_gail_reward------------------------------
        rew_gail=D.get_reward(sess,agent_obs, agent_act).ravel()

        buf.rew_buf = gail_ratio *rew_gail+r_env_ratio*buf.rew_buf
        for path_slice in buf.slicelist[:-1]:
            ep_ret_gail = rew_gail[path_slice].sum()
            ep_ret_sum = buf.rew_buf[path_slice].sum()
            logger.store(EpRet_Sum=ep_ret_sum,EpRet_Gail=ep_ret_gail)


        buf.culculate_adv_buf()
        
        # -------------Perform PPO update!--------------------

        update()
        
        logger.store(JS=js_d)


        # Log info about epoch
        # if epoch % 10 == 0:  # print the logs every 10 epochs
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('JS', average_only=True)
        #logger.log_tabular('JS_Ratio', average_only=True)    
        logger.dump_tabular()
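The reward mixing described in the docstring above, total reward = gail_ratio * rew_gail + r_env_ratio * rew_from_environment, can be checked with a tiny standalone numpy sketch (all numbers are made up):

import numpy as np

rew_env = np.array([0.0, 1.0, 0.0, 1.0])   # made-up per-step environment rewards
rew_gail = np.array([0.2, 0.5, 0.3, 0.6])  # made-up per-step discriminator rewards

gail_ratio, r_env_ratio = 1.0, 0.0         # pure GAIL reward, as in the defaults above
total_rew = gail_ratio * rew_gail + r_env_ratio * rew_env
print(total_rew)                           # [0.2 0.5 0.3 0.6]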
Ejemplo n.º 7
0
Archivo: ppo.py Proyecto: Ximeng12/ppo
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()  # game environment
    obs_dim = env.observation_space.shape  # get the observation dimension from the environment
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(
        env.observation_space, env.action_space)  # when building the neural network, a_ph does not exist yet
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)  # at this point the state and action here are still just placeholders

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob  # every step needs an action (pi here seems to represent the action)
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))  # the loss with its two parts
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # the computation graph was fully built above
    def update():
        # form the inputs into a dict for convenient use later
        # use the collected data to run gradient descent and update the parameters
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        # the training above is for the policy; this part is for the value function
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            # put the data into the buffer pool
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)
            # o should stand for the observation
            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # after playing through one game, perform one update
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
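The gamma and lam arguments documented above feed the GAE-lambda advantage estimator inside PPOBuffer, whose code is not shown on this page; the following is only a minimal standalone sketch of that estimator, assuming a single finished trajectory:

import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    # `vals` has one more entry than `rews`: the bootstrap value of the final state.
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(rews)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

rews = np.array([1.0, 1.0, 1.0])
vals = np.array([0.5, 0.5, 0.5, 0.0])  # last entry is the bootstrap value
print(gae_advantages(rews, vals))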
def main(env_fn,
         traj_dir,
         actor_critic=core.mlp_actor_critic,
         bc_itr=1000,
         ac_kwargs=dict(),
         d_hidden_size=64,
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         pi_lr=3e-4,
         vf_lr=1e-3,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=4000,
         target_kl=0.01,
         save_freq=100,
         r_env_ratio=0,
         reward_type='negative',
         trj_num=30,
         buf_size=None,
         si_update_ratio=0.02,
         js_threshold_ratio=0.5,
         js_smooth=5):
    """
    test behavior cloning
    """

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env,
                      hidden_size=d_hidden_size)  #!add Discriminator object
    D_js_m = JS_div_machine(env, hidden_size=d_hidden_size)

    e_obs = np.loadtxt(traj_dir + '/observations.csv', delimiter=',')
    e_act = np.loadtxt(traj_dir + '/actions.csv',
                       delimiter=',')  # Demo trajectory

    Sibuffer = SIBuffer(obs_dim,
                        act_dim,
                        e_obs,
                        e_act,
                        trj_num=trj_num,
                        max_size=buf_size,
                        js_smooth_num=js_smooth)  #!sibuf

    assert e_obs.shape[1:] == obs_dim
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])

    sess = tf.Session()

    BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph)

    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    BC.learn(Sibuffer.expert_obs, Sibuffer.expert_act, max_itr=bc_itr)
    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0
    # Setup model saving

    for epoch in range(1000000):
        a, v_t, logp_t = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})

        o, r, d, _ = env.step(a[0])
        env.render()
        time.sleep(1e-3)

        ep_ret_task += r
        ep_len += 1

        terminal = d or (ep_len == max_ep_len)
        if terminal:
            print('EpRet{},EpLen{}'.format(ep_ret_task, ep_len))
            o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(
            ), 0, False, 0, 0, 0, 0
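main() above leans on behavior cloning (BC.learn) to fit the policy to the expert data before rolling it out; the BehavioralCloning class itself is not shown on this page. The underlying idea is plain supervised regression from observations to expert actions, illustrated by this self-contained toy with a made-up linear "expert":

import numpy as np

rng = np.random.default_rng(0)
e_obs = rng.normal(size=(100, 3))       # fake expert observations
true_W = np.array([[0.5], [-1.0], [2.0]])
e_act = e_obs @ true_W                  # fake expert actions from a linear "expert"

# Behavior cloning here reduces to least-squares regression from obs to act.
W, *_ = np.linalg.lstsq(e_obs, e_act, rcond=None)
print(np.allclose(W, true_W))           # True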
Ejemplo n.º 9
0
def ppo(env_fn,
  # by default, use the neural network mlp we define in core
  actor_critic=core.mlp_actor_critic,
  ac_kwargs=dict(),
  seed=0,
  steps_per_epoch=4000,
  epochs=50,
  gamma=0.99,
  clip_ratio=0.2,
  pi_lr=3e-4,
  vf_lr=1e-3,
  train_pi_iters=80,
  train_v_iters=80,
  lam=0.97,
  max_ep_len=1000,
  target_kl=0.01,
  logger_kwargs=dict(),
  save_freq=10):
  """
  "Args:
  env_fn: A function which creates a copy of the environment.
  The environment must satisfy the OpenAI Gym API.

  actor_critic: A function which takes in placeholder symbols
  for state, ``x_ph``, and action ``a_ph``, and returns the main
  outputs from the agent's Tensorflow computation graph:

  ===========  ================  ======================================
  Symbol       Shape             Description
  ===========  ================  ======================================
  ``pi``       (batch, act_dim)  | Samples actions from policy given states.
  ``logp``     (batch,)          | Gives log probability according to
                                  | the policy, of taking actions ``a_ph``
                                  | in states ``x_ph``.
  ``logp_pi``  (batch,)          | Gives log probability, according to
                                  | the policy, of the action sampled by ``pi``.
  ``v``        (batch,)          | Gives the value estimate for states
                                  | in ``x_ph``.  (Critical: make sure
                                  | to flatten this!)
  ===========  ================  ======================================" -OpenAI
  Okay, quick interruption to OpenAI documentation here.
  actor_critic is the function which interfaces with tensorflow.  It takes in
  ``x_ph`` (x placeholder), ie. a representation of the current state, and
  ``a_ph``, a representation of some actions.  (TODO: document
  *what* these actions are).
  actor_critic runs these inputs through the tensorflow graph and returns several
  pieces of information that are relevant to PPO; these are described above.

  Back to OpenAI:
  "
  ac_kwargs (dict): Any kwargs appropriate for actor_critic function
      you provided to PPO.

  seed (int): Seed for random number generators.

  steps_per_epoch (int): Number of steps of interaction (state-action pairs)
      for the agent and the environment in each epoch.

  epochs (int): Number of epochs of interaction (equivalent to
      number of policy updates) to perform.

  gamma (float): Discount factor. (Always between 0 and 1.)

  clip_ratio (float): Hyperparameter for clipping in the policy objective.
      Roughly: how far can the new policy go from the old policy while
      still profiting (improving the objective function)? The new policy
      can still go farther than the clip_ratio says, but it doesn't help
      on the objective anymore.  (Usually small, 0.1 to 0.3.)

  pi_lr (float): Learning rate for policy optimizer.

  vf_lr (float): Learning rate for value function optimizer.

  train_pi_iters (int): Maximum number of gradient descent steps to take
      on policy loss per epoch.  (Early stopping may cause optimizer
      to take fewer than this.)

  train_v_iters (int): Number of gradient descent steps to take on
      value function per epoch.

  lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
      close to 1).

  max_ep_len (int): Maximum length of trajectory / episode / rollout.

  target_kl (float): Roughly what KL divergence we think is appropriate
      between new and old policies after an update.  This will get used
      for early stopping.  (Usually small, 0.01 or 0.05.)

  logger_kwargs (dict): Keyword args for EpochLogger.

  save_freq (int): How often (in terms of gap between epochs) to save
      the current policy and value function." - OpenAI
  """
  logger = EpochLogger(**logger_kwargs)
  logger.save_config(locals())

  # modify the seed based on the process so if
  # we run this in multiple processes
  # simultaneously we don't do the
  # exact same thing
  seed += 10000 * proc_id()
  # set up our random stuff with this seed
  tf.set_random_seed(seed)
  np.random.seed(seed)

  # create the environment
  env = env_fn()
  obs_dim = env.observation_space.shape
  act_dim = env.action_space.shape

  # tell the policy (implemented in actor_critic function) what the action space is
  ac_kwargs['action_space'] = env.action_space

  # "Inputs to computation graph" -OpenAI
  # create tensorflow placeholders for observations (x_ph), actions (a_ph),
  # advantages (adv_ph), returns (ret_ph), log probabilities
  # in the current state of the policy (logp_old_ph)
  # (old since this is used compared to the newer version of the policy
  # we are creating in the optimization step, comparing to this "old" version)
  x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
  adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

  # "Main outputs from computation graph" -OpenAI
  # essentially here we fill in the tensorflow graph so we can compute
  # the pi, logp, logp_pi, and v tensors based on the
  # x_ph and a_ph we created above
  pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

  # "Need all placeholders in *this* order later (to zip with data from buffer)" -OpenAI
  all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

  # "Every step, get: action, value, and logprob" -OpenAI
  # we later feed this list into tf.session.run()
  # to tell it to compute the value of pi, v, logp_pi
  # using the tensorflow graph we have created
  get_action_ops = [pi, v, logp_pi]

  # Experience buffer

  # number of steps per epoch per process
  local_steps_per_epoch = int(steps_per_epoch / num_procs())

  buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

  # Count the number of parameters we are gonna be training,
  # both for the policy and for the value function
  var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
  logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

  # PPO objectives
  # ratio is the ratio of two probabilities:
  # pi(a|s) / pi_old(a|s)
  # where pi(a|s) is the probability of performing action a
  # given state s GIVEN THE POLICY WHOSE PARAMETERS WE ARE CHANGING
  # DURING THE OPTIMIZATION STEP
  # and pi_old(a|s) is the probability of the policy,
  # with fixed mlp parameters after the last update,
  # performing action a given state s

  # we essentially use math to find the gradient of pi(a|s) with respect
  # to the parameters of the mlp, and this is the core of how we calculate
  # the gradient of the objective function for gradient descent

  ratio = tf.exp(logp - logp_old_ph) # "pi(a|s) / pi_old(a|s)"-OpenAI

  # this min_adv, along with the tf.minimum call in the next line of code,
  # implement the PPO-clip functionality

  # NOTE: calling this `min_adv` is a bit confusing; if advantage is negative
  # this is the min value we allow the gradient descent to consider as the advantage;
  # but it is the MAX value if advantage is positive.
  min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)

  # create the functions whose gradients we wish to use for gradient descent
  # during optimization
  # for our policy optimization, it is the PPO objective; 
  # for the value function it is simply an error-squared
  # note that reduce_mean just calculates the mean of the values in the tensor;
  # ie. this gives the expected value of the loss given the experimental values we have
  pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
  v_loss = tf.reduce_mean((ret_ph - v) ** 2)

  # Info (useful to watch during learning)
  approx_kl = tf.reduce_mean(logp_old_ph - logp) # "a sample estimate for KL-divergence, easy to compute" -OpenAI
  approx_ent = tf.reduce_mean(-logp) # "a sample estimate for entropy, also easy to compute" -OpenAI
  clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
  clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # what fraction of advantages are clipped

  # Optimizers
  # These use gradient descent with the gradient of the objective
  # functions we defined above to improve parameters for pi and v
  train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
  train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

  # initialize the tensorflow computation graph's parameters
  # with values
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  # "Sync params across processes" -OpenAI
  sess.run(sync_all_params())

  # Setup model saving
  logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

  def update():
    # create a dictionary of values, which specify to tensorflow what
    # to input for the placeholders: tensors containing the data from
    # the trajectory we have stored in buf
    inputs = {k:v for k, v in zip(all_phs, buf.get())}

    # calculate these for logging later
    pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

    # Training
    for i in range(train_pi_iters):
      # run a training step for the policy, and estimate the kl-divergence
      # (ie. how much the policy changed) on this step
      _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
      kl = mpi_avg(kl)

      # if the kl divergence is too high, stop training on this step
      # TODO: understand better why it is important to do this
      if kl > 1.5 * target_kl:
        logger.log('Early stopping at step %d due to reaching max kl.'%i)
        break

    logger.store(StopIter=i)

    # train our value function mlp
    for _ in range(train_v_iters):
      sess.run(train_v, feed_dict=inputs)

    # "Log changes from update" -OpenAI
    # TODO: This could be made a bit more computationally efficient by not recalculating pi_l_old each loop
    # after having calculated the same thing as pi_l_new the previous run through the loop!
    # Plus, does it really make the most sense to output pi_l_old and v_l_old as LossPi and LossV
    # instead of pi_l_new and v_l_new?
    pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
    logger.store(LossPi=pi_l_old, LossV=v_l_old,
        KL=kl, Entropy=ent, ClipFrac=cf,
        DeltaLossPi=(pi_l_new - pi_l_old),
        DeltaLossV=(v_l_new - v_l_old))
    

  start_time = time.time()

  # initialize the variables we use while training
  # o = observation (env.reset() returns initial observation)
  # r = reward (starts as 0)
  # d = done? (whether current episode in env is over)
  # ep_ret = episode return
  # ep_len = length of episode so far
  o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

  # "Main loop: collect experience in env and update/log each epoch"
  for epoch in range(epochs):
    for t in range(local_steps_per_epoch):
      
      # run the computation of the action, value function, and probability of the action
      # using the most recent observation in the x_ph slot
      a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

      # save and log
      buf.store(o, a, r, v_t, logp_t)
      logger.store(VVals=v_t)

      # take the action we computed and advance the environment
      o, r, d, _ = env.step(a[0])
      ep_ret += r
      ep_len += 1

      terminal = d or (ep_len == max_ep_len)
      if terminal or (t==local_steps_per_epoch - 1):
        if not terminal:
          print('Warning: trajectory cut off by epoch at %d steps'%ep_len)
        
        # "if trajectory didn't reach terminal state, bootstrap value target" -OpenAI
        # in other words, if we are stopping this trajectory due to a termination
        # signal from the env, last_val = the reward from the last step, r;
        # otherwise we stopped because we reached the max episode length or max local_steps_per_epoch,
        # in which case we set last_val = estimate of the value of the current state based on the v function
        # we are training
        last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
        
        buf.finish_path(last_val)

        # "only store EpRet / EpLen if trajectory finished" -OpenAI
        if terminal:
          logger.store(EpRet=ep_ret, EpLen=ep_len)

        # reset our training variables and the training environment
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # every save_freq epochs,
    # save the state of the environment
    # also save the current state of our value function model
    # and policy
    # these are automatically saved by the save_state function
    # since we have already called logger.setup_tf_saver
    if (epoch % save_freq == 0) or (epoch == epochs - 1):
      logger.save_state({'env': env}, None)

    # "Perform PPO update!"
    update()
    
    # "Log info about epoch"
    logger.log_tabular('Epoch', epoch)
    try:
      logger.log_tabular('EpRet', with_min_and_max=True)
      logger.log_tabular('EpLen', average_only=True)
    except:
      pass
    logger.log_tabular('VVals', with_min_and_max=True)
    logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
    logger.log_tabular('LossPi', average_only=True)
    logger.log_tabular('LossV', average_only=True)
    logger.log_tabular('DeltaLossPi', average_only=True)
    logger.log_tabular('DeltaLossV', average_only=True)
    logger.log_tabular('Entropy', average_only=True)
    logger.log_tabular('KL', average_only=True)
    logger.log_tabular('ClipFrac', average_only=True)
    logger.log_tabular('StopIter', average_only=True)
    logger.log_tabular('Time', time.time() - start_time)
    logger.dump_tabular()
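To make the comments above about min_adv and clipping concrete, here is a small numpy check with made-up ratios and advantages (one positive, one negative), both pushed outside the clip range:

import numpy as np

clip_ratio = 0.2
adv = np.array([2.0, -2.0])
ratio = np.array([1.5, 0.5])  # both outside the [0.8, 1.2] clip range

min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
obj = np.minimum(ratio * adv, min_adv)
print(min_adv)  # [ 2.4 -1.6]
print(obj)      # [ 2.4 -1.6]  (3.0 is capped at 2.4; -1.0 is replaced by -1.6)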
def ppo(env_fn,
        expert=None,
        policy_path=None,
        actor_critic=core.mlp_actor_critic_m,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=10000,
        dagger_epochs=500,
        pretrain_epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=1e-4,
        dagger_noise=0.01,
        batch_size=64,
        replay_size=int(5e3),
        vf_lr=1e-4,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.999,
        max_ep_len=500,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        test_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        policy_path (str): path of pretrained policy model
            train from scratch if None

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    test_logger_kwargs = dict()
    test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'],
                                                "test")
    test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    test_logger = EpochLogger(**test_logger_kwargs)
    test_logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    act_high_limit = env.action_space.high
    act_low_limit = env.action_space.low

    sess = tf.Session()
    if policy_path is None:
        # Inputs to computation graph
        x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                                   env.action_space)
        adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)
        tfa_ph = core.placeholder(act_dim)

        # Main outputs from computation graph
        mu, pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
        sess.run(tf.global_variables_initializer())

    else:
        # load pretrained model
        # sess, x_ph, a_ph, mu, pi, logp, logp_pi, v = load_policy(policy_path, itr='last', deterministic=False, act_high=env.action_space.high)
        # # get_action_2 = lambda x : sess.run(mu, feed_dict={x_ph: x[None,:]})[0]
        # adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)
        model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save'))
        x_ph, a_ph, adv_ph, ret_ph, logp_old_ph = model['x_ph'], model[
            'a_ph'], model['adv_ph'], model['ret_ph'], model['logp_old_ph']
        mu, pi, logp, logp_pi, v = model['mu'], model['pi'], model[
            'logp'], model['logp_pi'], model['v']
        # tfa_ph = core.placeholder(act_dim)
        tfa_ph = model['tfa_ph']

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    print("---------------", local_steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)
    # print(obs_dim)
    # print(act_dim)
    dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim[0],
                                              act_dim=act_dim[0],
                                              size=replay_size)
    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    if policy_path is None:
        ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                           (1 - clip_ratio) * adv_ph)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
        v_loss = tf.reduce_mean((ret_ph - v)**2)
        dagger_pi_loss = tf.reduce_mean(tf.square(mu - tfa_ph))

        # Info (useful to watch during learning)
        approx_kl = tf.reduce_mean(
            logp_old_ph -
            logp)  # a sample estimate for KL-divergence, easy to compute
        approx_ent = tf.reduce_mean(
            -logp)  # a sample estimate for entropy, also easy to compute
        clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio <
                                (1 - clip_ratio))
        clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

        # Optimizers
        dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
        optimizer_pi = tf.train.AdamOptimizer(learning_rate=pi_lr)
        optimizer_v = tf.train.AdamOptimizer(learning_rate=vf_lr)
        train_dagger_pi_op = dagger_pi_optimizer.minimize(
            dagger_pi_loss, name='train_dagger_pi_op')
        train_pi = optimizer_pi.minimize(pi_loss, name='train_pi_op')
        train_v = optimizer_v.minimize(v_loss, name='train_v_op')

        sess.run(tf.variables_initializer(optimizer_pi.variables()))
        sess.run(tf.variables_initializer(optimizer_v.variables()))
        sess.run(tf.variables_initializer(dagger_pi_optimizer.variables()))
    else:
        graph = tf.get_default_graph()
        dagger_pi_loss = model['dagger_pi_loss']
        pi_loss = model['pi_loss']
        v_loss = model['v_loss']
        approx_ent = model['approx_ent']
        approx_kl = model['approx_kl']
        clipfrac = model['clipfrac']

        train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op')
        train_pi = graph.get_operation_by_name('train_pi_op')
        train_v = graph.get_operation_by_name('train_v_op')
    # sess = tf.Session()
    # sess.run(tf.global_variables_initializer())

    # Sync params across processes
    # sess.run(sync_all_params())

    tf.summary.FileWriter("log/", sess.graph)
    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'adv_ph': adv_ph, 'ret_ph': ret_ph, 'logp_old_ph': logp_old_ph}, \
        outputs={'mu': mu, 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi, 'clipfrac': clipfrac, 'approx_kl': approx_kl, \
            'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'approx_ent': approx_ent})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    def choose_action(s, add_noise=False):
        s = s[np.newaxis, :]
        a = sess.run(mu, {x_ph: s})[0]
        if add_noise:
            noise = dagger_noise * act_high_limit * np.random.normal(
                size=a.shape)
            a = a + noise
        return np.clip(a, act_low_limit, act_high_limit)

    def test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, info = env.step(choose_action(np.array(o), 0))
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(
                        arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        # time.sleep(10)
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    def ref_test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = call_ref_controller(env, expert)
                o, r, d, info = env.step(a)
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(
                        arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    ref_test_agent(test_num=-1)
    test_logger.log_tabular('epoch', -1)
    test_logger.log_tabular('TestEpRet', average_only=True)
    test_logger.log_tabular('TestEpLen', average_only=True)
    test_logger.log_tabular('arrive_des', average_only=True)
    test_logger.log_tabular('arrive_des_appro', average_only=True)
    test_logger.log_tabular('converge_dis', average_only=True)
    test_logger.log_tabular('out_of_range', average_only=True)
    test_logger.dump_tabular()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    test_policy_epochs = 91
    episode_steps = 500
    total_env_t = 0
    test_num = 0
    print(colorize("begin dagger training", 'green', bold=True))
    for epoch in range(1, dagger_epochs + 1, 1):
        # test policy
        if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('arrive_des_appro', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

        # train policy
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        env.unwrapped._set_test_mode(False)
        obs, acs, rewards = [], [], []
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            # a = get_action_2(np.array(o))
            # save and log
            obs.append(o)
            ref_action = call_ref_controller(env, expert)
            if (epoch < pretrain_epochs):
                action = ref_action
            else:
                action = choose_action(np.array(o), True)

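            # DAgger: the expert's action (ref_action) is always appended to acs
            # below as the supervision label, even once the learner's own action
            # is executed (epoch >= pretrain_epochs), so the expert labels states
            # from the learner's own state distribution.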
            buf.store(o, action, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(action)
            acs.append(ref_action)
            rewards.append(r)

            ep_ret += r
            ep_len += 1
            total_env_t += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Perform the DAgger update plus a partial PPO update (value function only)!
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

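        # During the DAgger phase only the critic is fit with the PPO value loss
        # (above); the policy itself is trained with a supervised (behavior-
        # cloning) loss on expert actions sampled from the aggregated replay
        # buffer (below), which is why KL/Entropy/ClipFrac are logged as 0.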
        # Aggregate this epoch's (observation, expert action) pairs into the DAgger dataset
        max_step = len(rewards)
        dagger_replay_buffer.stores(obs, acs, rewards)
        for _ in range(int(local_steps_per_epoch / 10)):
            batch = dagger_replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']}
            q_step_ops = [dagger_pi_loss, train_dagger_pi_op]
            for j in range(10):
                outs = sess.run(q_step_ops, feed_dict)
            logger.store(LossPi=outs[0])

        c_v_loss = sess.run(v_loss, feed_dict=inputs)
        logger.store(LossV=c_v_loss,
                     KL=0,
                     Entropy=0,
                     ClipFrac=0,
                     DeltaLossPi=0,
                     DeltaLossV=0,
                     StopIter=0)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

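    # Phase 2: standard on-policy PPO, starting from the DAgger-pretrained
    # policy and reusing the update() defined earlier (clipped surrogate
    # objective plus value-function regression).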
    # Main loop: collect experience in env and update/log each epoch
    print(colorize("begin ppo training", 'green', bold=True))
    for epoch in range(1, epochs + 1, 1):
        # test policy
        if (epoch % save_freq == 0) or (epoch == epochs) or (epoch == 1):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('arrive_des_appro', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

        # train policy
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        env.unwrapped._set_test_mode(False)
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            # a = a[0]
            # a = get_action_2(np.array(o))
            # a = np.clip(a, act_low_limit, act_high_limit)
            # if epoch < pretrain_epochs:
            #     a = env.action_space.sample()
            # a = np.clip(a, act_low_limit, act_high_limit)
            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
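The DAgger step above feeds a placeholder `tfa_ph` and runs `dagger_pi_loss` / `train_dagger_pi_op`, whose construction lies outside this excerpt. The snippet below is only a minimal, hypothetical sketch of how such a behavior-cloning head is commonly wired in the same TF 1.x graph style; the squared-error loss, the toy network, and all sizes and learning rates are assumptions, not this example's actual definitions (which may, for instance, regress the Gaussian policy's mean instead).

import numpy as np
import tensorflow as tf  # TF 1.x graph-style API, as in the examples above

obs_dim, act_dim = 8, 2  # toy sizes, for illustration only
x_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))    # observations
tfa_ph = tf.placeholder(tf.float32, shape=(None, act_dim))  # expert action labels

# A tiny deterministic policy standing in for the example's `pi` output.
h = tf.layers.dense(x_ph, 64, activation=tf.nn.tanh)
pi = tf.layers.dense(h, act_dim)

# Behavior-cloning objective: regress the policy output onto the expert action.
dagger_pi_loss = tf.reduce_mean(tf.square(pi - tfa_ph))
train_dagger_pi_op = tf.train.AdamOptimizer(3e-4).minimize(dagger_pi_loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_obs = np.random.randn(32, obs_dim).astype(np.float32)
    batch_act = np.random.randn(32, act_dim).astype(np.float32)
    loss, _ = sess.run([dagger_pi_loss, train_dagger_pi_op],
                       feed_dict={x_ph: batch_obs, tfa_ph: batch_act})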
Example No. 11
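    # Constructor for a long-lived PPO agent driving a builder bot: the full
    # TensorFlow graph is built once here, while epoch/step counters and the
    # current observation are kept as instance state so the agent can be
    # stepped incrementally from the bot's own loop.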
    def __init__(self, args={}):
        self.bot = None
        if "bot" in args:
            self.bot = args["bot"]

        self.epoch = 0
        self.step = 0

        self.actor_critic = core.mlp_actor_critic
        self.ac_kwargs = dict(hidden_sizes=[64] * 2)
        self.seed = 0
        self.steps_per_epoch = 10000
        self.epochs = 10
        self.gamma = 0.99
        self.clip_ratio = 0.2
        self.pi_lr = 3e-4
        self.vf_lr = 1e-3
        self.train_pi_iters = 80
        self.train_v_iters = 80
        self.lam = 0.97
        self.max_ep_len = 1000
        self.target_kl = 0.01
        self.logger_kwargs = {}
        self.save_freq = 1

        map_name = "unknown"
        if self.bot is not None:
            map_name = self.bot.map_name
        self.logger_kwargs = {
            "output_dir": f".\\{map_name}\\ai_data",
            "exp_name": "builder_ai"
        }

        self.logger = EpochLogger(**self.logger_kwargs)

        #self.logger.save_config(locals())
        self.logger.save_config(self.__dict__)

        seed = self.seed
        seed += 10000 * proc_id()
        tf.set_random_seed(seed)
        np.random.seed(seed)

        #env = env_fn()
        self.env = BuilderEnv(args={"bot": self.bot})
        obs_dim = self.env.observation_space.shape
        act_dim = self.env.action_space.shape

        # Share information about action space with policy architecture
        self.ac_kwargs['action_space'] = self.env.action_space

        print(str(self.env.observation_space))
        print(str(self.env.action_space))

        print(str(type(self.env.observation_space)))
        print(str(type(self.env.action_space)))

        # Inputs to computation graph
        self.x_ph, self.a_ph = core.placeholders_from_spaces(
            self.env.observation_space, self.env.action_space)
        self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders(
            None, None, None)

        # Main outputs from computation graph
        self.pi, self.logp, self.logp_pi, self.v = self.actor_critic(
            self.x_ph, self.a_ph, **self.ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        self.all_phs = [
            self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph
        ]

        # Every step, get: action, value, and logprob
        self.get_action_ops = [self.pi, self.v, self.logp_pi]

        # Experience buffer
        self.local_steps_per_epoch = int(self.steps_per_epoch / num_procs())
        self.buf = ppo.PPOBuffer(
            obs_dim, act_dim, self.local_steps_per_epoch, self.gamma, self.lam
        )  # buffer sized for exactly one epoch of local experience

        # Count variables
        var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                        var_counts)

        # PPO objectives
        self.ratio = tf.exp(self.logp -
                            self.logp_old_ph)  # pi(a|s) / pi_old(a|s)
        self.min_adv = tf.where(self.adv_ph > 0,
                                (1 + self.clip_ratio) * self.adv_ph,
                                (1 - self.clip_ratio) * self.adv_ph)
        self.pi_loss = -tf.reduce_mean(
            tf.minimum(self.ratio * self.adv_ph, self.min_adv))
        self.v_loss = tf.reduce_mean((self.ret_ph - self.v)**2)

        # Info (useful to watch during learning)
        self.approx_kl = tf.reduce_mean(
            self.logp_old_ph -
            self.logp)  # a sample estimate for KL-divergence, easy to compute
        self.approx_ent = tf.reduce_mean(
            -self.logp)  # a sample estimate for entropy, also easy to compute
        self.clipped = tf.logical_or(self.ratio > (1 + self.clip_ratio),
                                     self.ratio < (1 - self.clip_ratio))
        self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32))

        print(f"pi_lr:{self.pi_lr}, pi_loss:{self.pi_loss}")

        # Optimizers
        self.train_pi = MpiAdamOptimizer(learning_rate=self.pi_lr).minimize(
            self.pi_loss)
        self.train_v = MpiAdamOptimizer(learning_rate=self.vf_lr).minimize(
            self.v_loss)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        # Sync params across processes
        self.sess.run(sync_all_params())

        # Setup model saving
        self.logger.setup_tf_saver(self.sess,
                                   inputs={'x': self.x_ph},
                                   outputs={
                                       'pi': self.pi,
                                       'v': self.v
                                   })

        self.start_time = time.time()
        self.o, self.r, self.d, self.ep_ret, self.ep_len = self.env.reset(
            args={}), 0, False, 0, 0
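        # The current observation, reward, done flag, and episode statistics are
        # kept on the instance so experience collection can resume one step at a
        # time between calls.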

        print(f"o:{self.o}, type:{type(self.o)}")

        self.epoch = 0
        self.t = 0

        self.load()
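        # load() (defined elsewhere in this example) presumably restores any
        # previously saved weights and counters so training can resume after a
        # restart.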