コード例 #1
0
ファイル: train_mnist.py プロジェクト: 2016choang/spinningup
def train_mnist(steps_per_epoch=100, epochs=5,
                lr=1e-3, layers=2, hidden_size=64,
                logger_kwargs=dict(), save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Load and preprocess MNIST data
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train.reshape(-1, 28 * 28) / 255.0

    # Define inputs & main outputs from computation graph
    x_ph = tf.placeholder(tf.float32, shape=(None, 28 * 28))
    y_ph = tf.placeholder(tf.int32, shape=(None,))
    logits = mlp(x_ph, hidden_sizes=[hidden_size] * layers + [10], activation=tf.nn.relu)
    predict = tf.argmax(logits, axis=1, output_type=tf.int32)

    # Define loss function, accuracy, and training op
    y = tf.one_hot(y_ph, 10)
    loss = tf.losses.softmax_cross_entropy(y, logits)
    acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32))
    train_op = tf.train.AdamOptimizer().minimize(loss)

    # Prepare session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph},
                          outputs={'logits': logits, 'predict': predict})

    start_time = time.time()

    # Run main training loop
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            idxs = np.random.randint(0, len(x_train), 32)
            feed_dict = {x_ph: x_train[idxs],
                         y_ph: y_train[idxs]}
            outs = sess.run([loss, acc, train_op], feed_dict=feed_dict)
            logger.store(Loss=outs[0], Acc=outs[1])

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state(state_dict=dict(), itr=None)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('Acc', with_min_and_max=True)
        logger.log_tabular('Loss', average_only=True)
        logger.log_tabular('TotalGradientSteps', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #2
0
def tac(env_fn,
        actor_critic=core.mlp_q_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=200,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        alpha_schedule='constant',
        q=1.0,
        q_schedule='constant',
        pdf_type='gaussian',
        log_type='q-log',
        batch_size=100,
        start_steps=10000,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):

    alpha = Alpha(alpha_start=alpha, schedule=alpha_schedule)
    entropic_index = EntropicIndex(q_end=q, schedule=q_schedule)

    alpha_t = alpha()
    q_t = entropic_index()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['pdf_type'] = pdf_type
    ac_kwargs['log_type'] = log_type

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)
    # Coefficient for Tsallis entropy
    alpha_ph = core.scale_holder()
    # Place holder for entropic index
    q_ph = core.entropic_index_holder()

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
            x_ph, a_ph, q_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, q_ph,
                                                   **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph,
                              'q': q_ph,
                              'alpha': alpha_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1), q_ph: q_t})

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a = get_action(o, True)
                o, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):

            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    alpha_ph: alpha_t,
                    q_ph: q_t
                }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             LossV=outs[3],
                             Q1Vals=outs[4],
                             Q2Vals=outs[5],
                             VVals=outs[6],
                             LogPi=outs[7])

                if np.isnan(outs[0]) or np.isnan(outs[7]).any():
                    print(outs)
                    return

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntIndex', q_t)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Update alpha and q value
            alpha_t = alpha()
            q_t = entropic_index()
コード例 #3
0
def td3(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space below.                     #
    #                                                                         #
    #=========================================================================#

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)
    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing, by adding clipped noise to target actions
        pi_noise_targ = get_pi_noise_clipped(pi,
                                             noise_scale=target_noise,
                                             noise_clip=noise_clip,
                                             act_limit=act_limit)
        # Target Q-values, using action from smoothed target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, pi_noise_targ,
                                              **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    q_targ = get_q_target(q1_targ, q2_targ, r_ph, d=d_ph, gamma=0.99)
    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.losses.mean_squared_error(q_targ, q1)
    q2_loss = tf.losses.mean_squared_error(q_targ, q2)
    q_loss = q1_loss + q2_loss

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space above.                     #
    #                                                                         #
    #=========================================================================#

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #4
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    maxRev = float("-inf")  #negative infinity in the beginning
    #maxRevActionSeq=[]
    maxRevTSTT = 0
    maxRevRevenue = 0
    maxRevThroughput = 0
    maxRevJAH = 0
    maxRevRemVeh = 0
    maxRevJAH2 = 0
    maxRevRMSE_MLvio = 0
    maxRevPerTimeVio = 0
    maxRevHOTDensity = pd.DataFrame()
    maxRevGPDensity = pd.DataFrame()
    maxtdJAHMax = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            #we need to scale the sampled values of action from (-1,1) to our choices of toll coz they were sampled from tanh activation mu
            numpyFromA = np.array(a[0])
            numpyFromA = ((numpyFromA + 1.0) *
                          (env.state.tollMax - env.state.tollMin) /
                          2.0) + env.state.tollMin
            a[0] = np.ndarray.tolist(numpyFromA)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                    #get other stats and store them too
                    otherStats = env.getAllOtherStats()
                    if np.any(np.isnan(np.array(otherStats))):
                        sys.exit("Nan found in statistics! Error")
                    logger.store(EpTSTT=otherStats[0],
                                 EpRevenue=otherStats[1],
                                 EpThroughput=otherStats[2],
                                 EpJAH=otherStats[3],
                                 EpRemVeh=otherStats[4],
                                 EpJAH2=otherStats[5],
                                 EpMLViolRMSE=otherStats[6],
                                 EpPerTimeVio=otherStats[7],
                                 EptdJAHMax=otherStats[8])
                    #determine max rev profile
                    if ep_ret > maxRev:
                        maxRev = ep_ret
                        maxRevActionSeq = env.state.tollProfile
                        maxRevTSTT = otherStats[0]
                        maxRevRevenue = otherStats[1]
                        maxRevThroughput = otherStats[2]
                        maxRevJAH = otherStats[3]
                        maxRevRemVeh = otherStats[4]
                        maxRevJAH2 = otherStats[5]
                        maxRevRMSE_MLvio = otherStats[6]
                        maxRevPerTimeVio = otherStats[7]
                        maxRevHOTDensity = env.getHOTDensityData()
                        maxRevGPDensity = env.getGPDensityData()
                        maxtdJAHMax = otherStats[8]
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpTSTT', average_only=True)
        logger.log_tabular('EpRevenue', average_only=True)
        logger.log_tabular('EpThroughput', average_only=True)
        logger.log_tabular('EpJAH', average_only=True)
        logger.log_tabular('EpRemVeh', average_only=True)
        logger.log_tabular('EpJAH2', average_only=True)
        logger.log_tabular('EpMLViolRMSE', average_only=True)
        logger.log_tabular('EpPerTimeVio', average_only=True)
        logger.log_tabular('EptdJAHMax', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    print("Max cumulative reward obtained= %f " % maxRev)
    print(
        "Corresponding revenue($)= %f, TSTT(hrs)= %f, Throughput(veh)=%f, JAHstat= %f, remaining vehicles= %f, JAHstat2=%f, RMSEML_vio=%f, percentTimeViolated(%%)=%f, tdJAHMax= %f"
        %
        (maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH, maxRevRemVeh,
         maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio, maxtdJAHMax))
    outputVector = [
        maxRev, maxRevRevenue, maxRevTSTT, maxRevThroughput, maxRevJAH,
        maxRevRemVeh, maxRevJAH2, maxRevRMSE_MLvio, maxRevPerTimeVio,
        maxtdJAHMax
    ]
    #print("\n===Max rev action sequence is\n",maxRevActionSeq)
    exportTollProfile(maxRevActionSeq, logger_kwargs, outputVector)
    exportDensityData(maxRevHOTDensity, maxRevGPDensity, logger_kwargs)
コード例 #5
0
def sac1(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=3000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         lr=1e-4,
         alpha=0.2,
         batch_size=150,
         start_steps=9000,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['obs_dim'] = obs_dim
    h_size = ac_kwargs["h_size"]  # hidden size of rnn
    seq_length = ac_kwargs["seq"]  # seq length of rnn

    # Inputs to computation graph
    seq = None  # training and testing doesn't has to have the same seq length
    x_ph, a_ph, r_ph, d_ph = core.placeholders([seq, obs_dim], [seq, act_dim],
                                               [seq, 1], [seq, 1])
    s_t_0 = tf.placeholder(shape=[None, h_size],
                           name="pre_state",
                           dtype="float32")  # zero state

    # Main outputs from computation graph
    outputs, states = cudnn_rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"])
    # outputs, _ = rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"])
    # outputs = mlp(outputs, [ac_kwargs["h_size"], ac_kwargs["h_size"]], activation=tf.nn.elu)
    # states = outputs[:, -1, :]

    # if use model predict next state (obs)
    with tf.variable_scope("model"):
        """hidden size for mlp
           h_size for RNN
        """
        s_predict = mlp(tf.concat([outputs, a_ph], axis=-1),
                        list(ac_kwargs["hidden_sizes"]) +
                        [ac_kwargs["h_size"]],
                        activation=tf.nn.relu)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(
            x_ph, a_ph, s_t_0, outputs, states, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, q1_pi_, q2_pi_ = actor_critic(x_ph, a_ph, s_t_0,
                                                     outputs, states,
                                                     **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 h_size=h_size,
                                 seq_length=seq_length,
                                 flag="seq",
                                 normalize=ac_kwargs["norm"])

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', "model"])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t model: %d \n'
        % var_counts)

    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.shape))
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=ac_kwargs["h0"])
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(logp_pi[:, :-1, :] + target_entropy))
        # Use smaller learning rate to make alpha decay slower
        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                 name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])

    # model train op
    # we can't use s_T to predict s_T+1
    delta_x = tf.stop_gradient(
        outputs[:, 1:, :] -
        outputs[:, :-1, :])  # predict delta obs instead of obs
    # model_loss = tf.abs((1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x[:, :, :obs_dim-act_dim]))
    model_loss = tf.abs(
        (1 - d_ph[:, :-1, :]) *
        (s_predict[:, :-1, :] - delta_x))  # how about "done" state
    model_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    if "m" in ac_kwargs["opt"]:
        value_params_1 = get_vars('model') + get_vars('rnn')
    else:
        value_params_1 = get_vars('model')
    # opt for optimize model
    train_model_op = model_optimizer.minimize(tf.reduce_mean(model_loss),
                                              var_list=value_params_1)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(tf.minimum(q1_pi_, q2_pi_) - alpha * logp_pi)
    # clip curiosity
    in_r = tf.stop_gradient(
        tf.reduce_mean(tf.clip_by_value(model_loss, 0, 64),
                       axis=-1,
                       keepdims=True))
    beta = tf.placeholder(dtype=tf.float32, shape=(), name="beta")
    # beta = ac_kwargs["beta"]  # adjust internal reward
    # can we prove the optimal value of beta
    # I think beta should decrease with training going on
    # beta = alpha  # adjust internal reward
    q_backup = r_ph[:, :-1, :] + beta * in_r + gamma * (
        1 - d_ph[:, :-1, :]) * v_backup[:, 1:, :]

    # Soft actor-critic losses
    # pi_loss = tf.reduce_mean(alpha * logp_pi[:, :-1, :] - q1_pi[:, :-1, :])
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    # in some case, the last timestep Q function is super important so maybe we can use weight sum of loss
    # calculate last timestep separately for convince
    # q1_loss = 0.5 * tf.reduce_mean((q1[:, :-1, :] - q_backup) ** 2)
    q1_loss = tf.reduce_mean((q1[:, :-1, :] - q_backup)**2)
    q2_loss = tf.reduce_mean((q2[:, :-1, :] - q_backup)**2)
    # q2_loss = 0.5 * tf.reduce_mean((q2[:, :-1, :] - q_backup) ** 2)
    Q_loss = q1[:, :-1, :] - q_backup
    P_loss = alpha * logp_pi - q1_pi
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # train model first
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    with tf.control_dependencies([train_model_op]):
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    if "q" in ac_kwargs["opt"]:
        value_params = get_vars('main/q') + get_vars('rnn')
    else:
        value_params = get_vars('main/q')

    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in non_deterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [
            pi_loss, q1_loss, q2_loss, q1, q2, logp_pi,
            tf.identity(alpha), model_loss, train_model_op, train_pi_op,
            train_value_op, target_update
        ]
    else:
        step_ops = [
            pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, model_loss,
            train_model_op, train_pi_op, train_value_op, target_update,
            train_alpha_op, Q_loss, P_loss
        ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, s_t_0_, mu, pi, states, deterministic=False):
        """s_t_0_  starting step for testing 1 H"""

        act_op = mu if deterministic else pi
        action, s_t_1_ = sess.run(
            [act_op, states],
            feed_dict={
                x_ph: o.reshape(1, 1, obs_dim),
                a_ph: np.zeros([1, 1, act_dim]),
                s_t_0: s_t_0_
            })
        return action.reshape(act_dim), s_t_1_

    def test_agent(mu, pi, states, n=10):
        # global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            s_0 = np.zeros([1, h_size])
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a, s_1 = get_action(o, s_0, mu, pi, states, deterministic=True)
                s_0 = s_1
                o, r, d, _ = test_env.step(a)
                # test_env.render()
                ep_ret += r
                ep_len += 1
                # replay_buffer.store(o.reshape([1, obs_dim]), a.reshape([1, act_dim]), r, d)
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    # start = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    s_t_0_ = np.zeros([1, h_size])
    episode = 0

    for t in range(total_steps + 1):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t == 0:
            start = time.time()

        if t > start_steps:
            # s_t_0_store = s_t_0_    # hidden state stored in buffer
            a, s_t_1_ = get_action(o,
                                   s_t_0_,
                                   mu,
                                   pi,
                                   states,
                                   deterministic=False)
            s_t_0_ = s_t_1_
        else:
            # s_t_0_store = s_t_0_
            # print(s_t_0_.shape)
            _, s_t_1_ = get_action(o,
                                   s_t_0_,
                                   mu,
                                   pi,
                                   states,
                                   deterministic=False)
            s_t_0_ = s_t_1_
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(
            a
        )  # give back o_t_1 we need store o_t_0 because that is what cause a_t_0
        # print(r)
        # env.render()
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o.reshape([1,
                                       obs_dim]), s_t_0_.reshape([1, h_size]),
                            a.reshape([1, act_dim]), r, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            # fps = (time.time() - start)/200
            # print("{} fps".format(200 / (time.time() - start)))
            print(ep_len)
            episode += 1
            start = time.time()
            beta_ = ac_kwargs["beta"] * (1 - t / total_steps)
            # beta_ = ac_kwargs["beta"] * (1 / t ** 0.5)
            for j in range(int(ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                # maybe we can store starting state
                feed_dict = {
                    x_ph: batch['obs1'],
                    s_t_0: batch[
                        's_t_0'],  # all zero matrix for zero state in training
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    beta: beta_,
                }
                for _ in range(ac_kwargs["tm"] - 1):
                    batch = replay_buffer.sample_batch(batch_size)
                    # maybe we can store starting state
                    feed_dict = {
                        x_ph: batch['obs1'],
                        s_t_0:
                        batch['s_t_0'],  # stored zero state for training
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done'],
                        beta: beta_,
                    }
                    _ = sess.run(train_model_op, feed_dict)
                outs = sess.run(step_ops, feed_dict)
                # print(outs)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3].flatten(),
                             Q2Vals=outs[4].flatten(),
                             LogPi=outs[5].flatten(),
                             Alpha=outs[6],
                             beta=beta_,
                             model_loss=outs[7].flatten())

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            s_t_0_ = np.zeros([1, h_size
                               ])  # reset s_t_0_ when one episode is finished
            print("one episode duration:", time.time() - start)
            start = time.time()

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            if epoch % 50 == 0:
                np.save("model__mq_loss_{}".format(epoch), outs[7])
                np.save("Q_mq_loss_{}".format(epoch), outs[-2])
                np.save("P_mq_loss_{}".format(epoch), outs[-1])

            # Save model
            # if (epoch % save_freq == 0) or (epoch == epochs - 1):
            #     logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(mu, pi, states)

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Episode', episode)
            logger.log_tabular('name', name)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('beta', average_only=True)
            logger.log_tabular('model_loss', with_min_and_max=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #6
0
ファイル: meil-v3.py プロジェクト: johnhallman/reil-algorithm
def ppo(BASE_DIR,
        expert_density,
        env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        steps_per_epoch=1000,
        epochs=10,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=50,
        train_v_iters=50,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        data_n=10):

    data = {}  # ALL THE DATA

    logger_kwargs = setup_logger_kwargs(args.dir_name, data_dir=BASE_DIR)
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # update rule
    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    policy_distr = Gaussian_Density()
    policy = lambda s: np.random.uniform(
        -2.0, 2.0, size=env.action_space.shape)  # random policy
    policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                       args.iter_length)
    density = policy_distr.density()

    data[0] = {
        'pol_s': policy_distr.num_samples,
        'pol_t': policy_distr.num_trajects
    }

    dist_rewards = []

    # repeat REIL for given number of rounds
    for i in range(args.rounds):

        message = "\nRound {} out of {}\n".format(i + 1, args.rounds)
        reward = lambda s: expert_density(s) / (density(s) + args.eps)

        dist_rewards.append(reward)

        start_time = time.time()
        o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        r = reward(o)  # custom reward

        # Main loop: collect experience in env and update/log each epoch
        for epoch in range(epochs):
            for t in range(local_steps_per_epoch):

                a, v_t, logp_t = sess.run(get_action_ops,
                                          feed_dict={x_ph: o.reshape(1, -1)})

                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                o, old_r, d, _ = env.step(a[0])
                r = reward(o)
                ep_ret += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (t == local_steps_per_epoch - 1):
                    if not (terminal):
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = old_r if d else sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})
                    last_val = reward(o)
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, old_r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    r = reward(o)

            # store model!
            if (epoch == epochs - 1): logger.save_state({'env': env}, None)

            # Perform PPO update!
            update()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts',
                               (epoch + 1) * steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('DeltaLossPi', average_only=True)
            logger.log_tabular('DeltaLossV', average_only=True)
            logger.log_tabular('Entropy', average_only=True)
            logger.log_tabular('KL', average_only=True)
            logger.log_tabular('ClipFrac', average_only=True)
            logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            print(message)

        policy = lambda state: sess.run(
            get_action_ops, feed_dict={x_ph: state.reshape(1, -1)})[0][0]
        data[i] = {
            'pol_s': policy_distr.num_samples,
            'pol_t': policy_distr.num_trajects
        }
        data[i]['rewards'] = evaluate_reward(env, policy, data_n)

        if i != args.rounds - 1:
            policy_distr.train(env, policy, args.trajects, args.distr_gamma,
                               args.iter_length)
            density = policy_distr.density()

    return data, dist_rewards
コード例 #7
0
ファイル: ppo.py プロジェクト: h-aboutalebi/maxent
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        explorer=None,
        eps=.03,
        pretrain_epochs=0):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    total_epochs = epochs + pretrain_epochs

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(total_epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # explore if you are in a pretrain epoch or if eps-greedy
            pre = epoch < pretrain_epochs
            during = random.random() < eps
            if pre or during:
                if explorer is None:
                    raise ValueError('Trying to explore but explorer is None')
                state = env.env.state_vector()
                a = explorer.sample_action(state)

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #8
0
ファイル: sac.py プロジェクト: yining043/SAC-discrete
def sac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # env params
    thresh          = rl_params['thresh']

    # control params
    seed            = rl_params['seed']
    epochs          = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size     = rl_params['replay_size']
    batch_size      = rl_params['batch_size']
    start_steps     = rl_params['start_steps']
    max_ep_len      = rl_params['max_ep_len']
    max_noop        = rl_params['max_noop']
    save_freq       = rl_params['save_freq']
    render          = rl_params['render']

    # rl params
    gamma           = rl_params['gamma']
    polyak          = rl_params['polyak']
    lr              = rl_params['lr']
    grad_clip_val   = rl_params['grad_clip_val']

    alpha                = rl_params['alpha']
    target_entropy_start = rl_params['target_entropy_start']
    target_entropy_stop  = rl_params['target_entropy_stop']
    target_entropy_steps = rl_params['target_entropy_steps']

    train_env, test_env = env_fn(), env_fn()
    obs_space = env.observation_space
    act_space = env.action_space

    tf.set_random_seed(seed)
    np.random.seed(seed)
    train_env.seed(seed)
    train_env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    # get the size after resize
    obs_dim = network_params['input_dims']
    act_dim = act_space.n

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # init a state buffer for storing last m states
    train_state_buffer = StateBuffer(m=obs_dim[2])
    test_state_buffer  = StateBuffer(m=obs_dim[2])

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None)

    # alpha and entropy setup
    max_target_entropy = tf.log(tf.cast(act_dim, tf.float32))
    target_entropy_prop_ph =  tf.placeholder(dtype=tf.float32, shape=())
    target_entropy = max_target_entropy * target_entropy_prop_ph

    log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)

    if alpha == 'auto': # auto tune alpha
        alpha = tf.exp(log_alpha)
    else: # fixed alpha
        alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, pi_logits, q1_logits, q2_logits, q1_a, q2_a, q1_pi, q2_pi = build_models(x_ph, a_ph, act_dim, network_params)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_targ, _, _, _,  _, _, q1_pi_targ, q2_pi_targ = build_models(x2_ph, a_ph, act_dim, network_params)

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t alpha: %d, \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)

    # Min Double-Q: (check the logp_pi bit)
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ)

    # Targets for Q regression
    q_backup = r_ph + gamma*(1-d_ph)*tf.stop_gradient(min_q_pi_targ - alpha * logp_pi_targ)

    # critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
    value_loss = q1_loss + q2_loss

    # actor loss
    pi_loss = tf.reduce_mean(alpha*logp_pi - min_q_pi)

    # alpha loss for temperature parameter
    alpha_backup = tf.stop_gradient(logp_pi + target_entropy)
    alpha_loss   = -tf.reduce_mean(log_alpha * alpha_backup)

    # Policy train op
    # (has to be separate from value train op, because q1_logits appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    if grad_clip_val is not None:
        gvs = pi_optimizer.compute_gradients(pi_loss,  var_list=get_vars('main/pi'))
        capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs]
        train_pi_op = pi_optimizer.apply_gradients(capped_gvs)
    else:
        train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_pi_op]):
        if grad_clip_val is not None:
            gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q'))
            capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs]
            train_value_op = value_optimizer.apply_gradients(capped_gvs)
        else:
            train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q'))

    # Alpha train op
    alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_value_op]):
        train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha'))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a,
                logp_pi, target_entropy,
                alpha_loss, alpha,
                train_pi_op, train_value_op, train_alpha_op, target_update]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                                outputs={'mu': mu, 'pi': pi, 'q1': q1_a, 'q2': q2_a})

    def get_action(state, deterministic=False):
        state = state.astype('float32') / 255.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: [state]})[0]

    def reset(env, state_buffer):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # fire to start game and perform no-op for some frames to randomise start
        o, _, _, _ = env.step(1) # Fire action to start game
        for _ in range(np.random.randint(1, max_noop)):
                o, _, _, _ = env.step(0) # Action 'NOOP'

        o = process_image_observation(o, obs_dim, thresh)
        r = process_reward(r)
        old_lives = env.ale.lives()
        state = state_buffer.init_state(init_obs=o)
        return o, r, d, ep_ret, ep_len, old_lives, state

    def test_agent(n=10, render=True):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len, test_old_lives, test_state = reset(test_env, test_state_buffer)
            terminal_life_lost_test = False

            if render: test_env.render()

            while not(d or (ep_len == max_ep_len)):

                # start by firing
                if terminal_life_lost_test:
                    a = 1
                else:
                    # Take  lower variance actions at test(noise_scale=0.05)
                    a = get_action(test_state, True)

                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(a)
                o = process_image_observation(o, obs_dim, thresh)
                r = process_reward(r)
                test_state = test_state_buffer.append_state(o)
                ep_ret += r
                ep_len += 1

                if test_env.ale.lives() < test_old_lives:
                    test_old_lives = test_env.ale.lives()
                    terminal_life_lost_test = True
                else:
                    terminal_life_lost_test = False

                if render: test_env.render()

            if render: test_env.close()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # ================== Main training Loop  ==================

    start_time = time.time()
    o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer)
    total_steps = steps_per_epoch * epochs

    target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps)
    save_iter = 0
    terminal_life_lost = False

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # press fire to start
        if terminal_life_lost:
            a = 1
        else:
            if t > start_steps:
                a = get_action(state)
            else:
                a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        o2        = process_image_observation(o2, obs_dim, thresh)
        r         = process_reward(r)
        one_hot_a = process_action(a, act_dim)

        next_state = train_state_buffer.append_state(o2)

        ep_ret += r
        ep_len += 1

        if train_env.ale.lives() < old_lives:
            old_lives = train_env.ale.lives()
            terminal_life_lost = True
        else:
            terminal_life_lost = False

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(state, one_hot_a, r, next_state, terminal_life_lost)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        state = next_state

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph:  batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph:  batch['acts'],
                             r_ph:  batch['rews'],
                             d_ph:  batch['done'],
                             target_entropy_prop_ph: target_entropy_prop
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],    LossQ2=outs[2],
                             Q1Vals=outs[3],    Q2Vals=outs[4],
                             LogPi=outs[5], TargEntropy=outs[6],
                             LossAlpha=outs[7], Alpha=outs[8])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env, train_state_buffer)


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # update target entropy every epoch
            target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps)

            # Save model
            if save_freq is not None:
                if (epoch % save_freq == 0) or (epoch == epochs-1):
                    print('Saving...')
                    logger.save_state({'env': env},  itr=save_iter)
                    save_iter+=1

            # Test the performance of the deterministic version of the agent.
            test_agent(n=5, render=render)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', average_only=True)
            logger.log_tabular('TargEntropy', average_only=True)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
コード例 #9
0
def td3(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.
            
        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    def set_seeds(seed=seed):
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        tf.set_random_seed(seed)
        np.random.seed(seed)
        """
            Enable 100% reproducibility on operations related to tensor and randomness.
            Parameters:
            seed (int): seed value for global randomness
            fast_n_close (bool): whether to achieve efficient at the cost of determinism/reproducibility
        """
        #set_seeds(seed=seed)

        #logging.warning("*******************************************************************************")
        #logging.warning("*** set_global_determinism is called,setting full determinism, will be slow ***")
        #logging.warning("*******************************************************************************")

        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
        # https://www.tensorflow.org/api_docs/python/tf/config/threading/set_inter_op_parallelism_threads
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        #from tfdeterminism import patch
        #patch()

    np.random.seed(seed)
    random.seed(seed)
    set_seeds(seed=2)
    env = env_fn()
    env.seed(seed)
    env.action_space.seed(seed)
    #env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):

        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss
    global_step1 = tf.Variable(0, trainable=False)
    global_step2 = tf.Variable(0, trainable=False)
    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(
                    get_action(o, 0)
                )  # _ no processing this _ , need to let Spinning up to buffer & let optuna acces _
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:  # update_after=1000, update_every=50
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            #test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            #logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            #logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #10
0
def ppo(env_fn,
        ref_func=None,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=500,
        epochs=10000,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=500,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    t_a_ph = core.placeholder_from_space(env.action_space)
    ret_ph = core.placeholder(None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, t_a_ph, ret_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    print("---------------", local_steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # dagger objectives
    pi_loss = tf.reduce_mean(tf.square(pi - t_a_ph))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old = sess.run([pi_loss, v_loss], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            sess.run(train_pi, feed_dict=inputs)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new = sess.run([pi_loss, v_loss], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(1, epochs + 1, 1):
        for t in range(local_steps_per_epoch):
            a_s, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            a = a_s[0]
            ref_a = call_mpc(env, ref_func)
            if (epoch < 100):
                a = ref_a

            # save and log
            buf.store(o, a, ref_a, r)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #11
0
def sac(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    print("---")
    print("obs_dim:", obs_dim)
    print("act_dim:", act_dim)
    print("act_limit:", act_limit)
    print("env.action_space", env.action_space)

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2 = actor_critic(x_ph, a_ph, **ac_kwargs)

        # ty: placeholder to hold meta strategy param TODO: check meta_log_std dimension
        meta_mu = core.placeholder(act_dim)
        meta_log_std = core.placeholder(act_dim)

        meta_mu_next = core.placeholder(act_dim)
        meta_log_std_next = core.placeholder(act_dim)

        # ty: logp_phi
        logp_phi = core.gaussian_likelihood(a_ph, meta_mu, meta_log_std)
        _, _, logp_phi = core.apply_squashing_func(meta_mu, a_ph, logp_phi)

    with tf.variable_scope('main', reuse=True):
        # compose q with pi, for pi-learning
        _, _, _, q1_pi, q2_pi = actor_critic(x_ph, pi, **ac_kwargs)

        # get actions and log probs of actions for next states, for Q-learning
        _, pi_next, logp_pi_next, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

        # ty: logp_phi_next, make sure the action is from the current policy
        logp_phi_next = core.gaussian_likelihood(pi_next, meta_mu_next,
                                                 meta_log_std_next)
        _, _, logp_phi_next = core.apply_squashing_func(
            meta_mu_next, pi_next, logp_phi_next)

    # Target value network
    with tf.variable_scope('target'):
        # target q values, using actions from *current* policy
        _, _, _, q1_targ, q2_targ = actor_critic(x2_ph, pi_next, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    min_q_targ = tf.minimum(q1_targ, q2_targ)

    # Entropy-regularized Bellman backup for Q functions, using Clipped Double-Q targets
    q_backup = tf.stop_gradient(
        r_ph + gamma * (1 - d_ph) *
        (min_q_targ - alpha * logp_pi_next + alpha * logp_phi_next))

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - alpha * logp_phi - min_q_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # ty: temporary values for meta_mu, ...
        # temp0s = np.ones((100,4)) * (-10)
        # ty: temporary variance for meta strategy
        temp1s = np.ones((100, 4))
        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    # ty: fill in correct values
                    meta_mu: np.apply_along_axis(obs2mu, 1, batch['obs1']),
                    meta_log_std: temp1s,
                    meta_mu_next: np.apply_along_axis(obs2mu, 1,
                                                      batch['obs2']),
                    meta_log_std_next: temp1s
                }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3],
                             Q2Vals=outs[4],
                             LogPi=outs[5])

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #12
0
def gail(env_fn,traj_dir, actor_critic=core.mlp_actor_critic_add, ac_kwargs=dict(),d_hidden_size =64,d_batch_size = 64,seed=0, 
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=40, train_v_iters=40, lam=0.97, max_ep_len=4000,beta =1e-4,
        target_kl=0.01, logger_kwargs=dict(), save_freq=100,
        r_env_ratio=0,gail_ratio =1, d_itr =20, reward_type = 'negative',
        pretrain_bc_itr =0):
    """

    additional args
    d_hidden_size : hidden layer size of Discriminator
    d_batch_size : Discriminator's batch size

    r_env_ratio,gail_ratio : the weight of rewards from envirionment and gail .Total reward = gail_ratio *rew_gail+r_env_ratio* rew_from_environment
    
    d_itr : The number of iteration of update discriminater 
    reward_type : GAIL reward has three type ['negative','positive', 'AIRL']
    trj_num :the number of trajectory for 
    pretrain_bc_itr: the number of iteration of pretraining by behavior cloeing
    
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D=Discriminator(env,hidden_size = d_hidden_size,reward_type =reward_type)
    
    
    e_obs = np.loadtxt(traj_dir + '/observations.csv',delimiter=',')
    e_act = np.loadtxt(traj_dir + '/actions.csv',delimiter= ',')#Demo treajectory

    Sibuffer =SIBuffer(obs_dim, act_dim, e_obs,e_act,trj_num= 0, max_size =None)#!sibuf

    assert e_obs.shape[1:] == obs_dim 
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi,pi_std, entropy, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))- beta*entropy
    v_loss = tf.reduce_mean((ret_ph - v)**2)#ret_phには累積報酬のバッファが入る
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()


    BC = BehavioralCloning(sess,pi,logp,x_ph,a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())


    # Sync params across processes

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k:v for k,v in zip(all_phs, buf.get())}#all_phsは各バッファーに対応するプレースホルダー辞書
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training#ここも変える必要あり? おそらく変えなくて良い
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:#更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る
                logger.log('Early stopping at step %d due to reaching max kl.'%i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):#vの更新
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update(新しいロスの計算)
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        
        std, std_ent = sess.run([pi_std,entropy],feed_dict = inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, 
                     KL=kl, Entropy=std_ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),#更新での改善量
                     DeltaLossV=(v_l_new - v_l_old),
                     Std = std)

    start_time = time.time()
    o, r, d, ep_ret_task,ep_ret_gail, ep_len = env.reset(), 0, False, 0,0 , 0


    if pretrain_bc_itr>0:
        BC.learn(Sibuffer.expert_obs,Sibuffer.expert_act ,max_itr =pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            buf.store_rew(r)
            '''
            if t <150:
                env.render()
                time.sleep(0.03)
            '''

            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t==local_steps_per_epoch-1):
                if d:# if trajectory didn't reach terminal state, bootstrap value target
                    last_val = r 
                else:
                    last_val = sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})#v_last=...だったけどこれで良さげ
            
                buf.store_rew(last_val)#if its terminal ,nothing change and if its maxitr last_val is use
                buf.finish_path()
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret_task, EpLen=ep_len)#,EpRet_Sum =ep_ret_sum,EpRet_Gail =ep_ret_gail)
        
                o, r, d, ep_ret_task,ep_ret_sum,ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0, 0

        # Save model
        
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, epoch)


        agent_obs , agent_act = buf.obs_buf, buf.act_buf

        d_batch_size = d_batch_size#or len(agent_obs)//d_itr #update discreminator
        for _t in range(d_itr):
            e_obs_batch ,e_act_batch =Sibuffer.get_random_batch(d_batch_size)
            a_obs_batch  =sample_batch(agent_obs,batch_size = d_batch_size)
            a_act_batch= sample_batch(agent_act,batch_size = d_batch_size)
            D.train(sess, e_obs_batch,e_act_batch , a_obs_batch,a_act_batch )
        js_d = D.get_js_div(sess,Sibuffer.main_obs_buf,Sibuffer.main_act_buf,agent_obs,agent_act)
        #---------------get_gail_reward------------------------------
        rew_gail=D.get_reward(sess,agent_obs, agent_act).ravel()

        buf.rew_buf = gail_ratio *rew_gail+r_env_ratio*buf.rew_buf
        for path_slice in buf.slicelist[:-1]:
            ep_ret_gail = rew_gail[path_slice].sum()
            ep_ret_sum = buf.rew_buf[path_slice].sum()
            logger.store(EpRet_Sum=ep_ret_sum,EpRet_Gail=ep_ret_gail)


        buf.culculate_adv_buf()
        
        # -------------Perform PPO update!--------------------

        update()
        
        logger.store(JS=js_d)


        # Log info about epoch
        #if epoch%10 == 0:#logger print each 10 epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('JS', average_only=True)
        #logger.log_tabular('JS_Ratio', average_only=True)    
        logger.dump_tabular()
コード例 #13
0
def ppo(env_fn, actor_critic=core_2.mlp_actor_critic, beta=1, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()  # game environment
    obs_dim = env.observation_space.shape  # get the observe dimension from environment
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #print(env.action_space)
    x_ph, a_ph = core_2.placeholders_from_spaces(env.observation_space, env.action_space)  # 构建神经网络的时候,a_ph还没有
    adv_ph, ret_ph, logp_old_ph, log_old_ph_all = core_2.placeholders(None, None, None, 18)
    #print(logp_old_ph)
    #print(log_old_ph_all)
    # Main outputs from computation graph
    pi, logp, logp_pi, v, logp_all = actor_critic(x_ph, a_ph, **ac_kwargs)  # 目前这里的状态和action都还是放的placeholder

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, log_old_ph_all]

    # Every step, get: action, value, and logprob # 每一步都需要得到action(这里的pi似乎表示action)
    get_action_ops = [pi, v, logp_pi, logp_all]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core_2.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    #print((tf.exp(log_old_ph_all) * (logp - logp_old_ph)))
    kl = tf.reduce_mean(tf.multiply(tf.exp(log_old_ph_all),tf.transpose([logp - logp_old_ph])))
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    #pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))  # 两部分的loss
    pi_loss = -tf.reduce_mean(ratio * adv_ph - beta * kl)

    v_loss = tf.reduce_mean((ret_ph - v) ** 2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    # 同步参数
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # 主循环
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, logp_all = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            # 把数据放进 buffer pool 里
            buf.store(o, a, r, v_t, logp_t, logp_all)
            logger.store(VVals=v_t)
            # o 应该代表observation
            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # 打完一局游戏,执行一次更新
        #update()

        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kld = sess.run([train_pi, kl], feed_dict=inputs)
            kld = mpi_avg(kld)
            if kld > 1.5 * target_kl:
                beta = 2 * beta
            if kld < target_kl / 1.5:
                beta = beta / 2
                # logger.log('Early stopping at step %d due to reaching max kl.' % i)
                # break
        logger.store(StopIter=i)
        # 上部分的train是policy,这部分是值函数
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run([pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #14
0
ファイル: sac.py プロジェクト: kekeblom/spinningup
def sac(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    config = Namespace(gamma=0.99,
                       entropy_level=-1,
                       lr=1e-3,
                       batch_size=128,
                       polyak=0.995,
                       replay_size=100000)
    sess = tf.Session()
    sac = SAC(sess, config, env.action_space, env.observation_space)
    sac.initialize()

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': sac.x_ph['input'],
                              'a': sac.a_ph
                          },
                          outputs={
                              'mu': sac.mu,
                              'pi': sac.pi,
                              'q1': sac.q1,
                              'q2': sac.q2,
                              'v': sac.v
                          })

    def test_agent(n=10):
        for j in range(n):
            obs, reward, done, ep_ret, ep_len = test_env.reset(
            ), 0, False, 0, 0
            while not (done or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                obs, reward, done, _ = test_env.step(
                    sac.act({'input': obs[None]}, deterministic=True))
                ep_ret += reward
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    obs, reward, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            action = sac.act({'input': obs[None]})
        else:
            action = env.action_space.sample()

        # Step the env
        obs_next, reward, done, _ = env.step(action)
        ep_ret += reward
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        done = False if ep_len == max_ep_len else done

        # Store experience to replay buffer
        sac.observe({'input': obs}, action, reward, {'input': obs_next}, done)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        obs = obs_next

        if done or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """

            for j in range(ep_len):
                outs = sac.train()
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             LossV=outs[3],
                             Q1Vals=outs[4],
                             Q2Vals=outs[5],
                             VVals=outs[6],
                             LogPi=outs[7])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            obs, reward, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #15
0
ファイル: ddpg.py プロジェクト: IST-DASLab/dISTiller-micronet
def ddpg(env, test_env, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         act_noise=0.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    # Neta - logger is giving me some problems, so ignore
    # logger.save_config(locals())

    # Neta - disable seeding
    # tf.set_random_seed(seed)
    # np.random.seed(seed)

    # Neta - we pass the gym environments to DDPG, instead of using a factory method
    # env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    msglogger.info('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q-backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        msglogger.info("spinup_ddpg: pi a={}".format(a))
        a += noise_scale * np.random.randn(act_dim)
        msglogger.info("spinup_ddpg: pi a={} after adding noise".format(a))
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    msglogger.info("spinup_ddpg [after reset]: o={} r={} d={}".format(o, r, d))
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        msglogger.info("spinup_ddpg: o2={} r={} d={}".format(o2, r, d))
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """

            # Neta
            # stats = ('Peformance/Validation/',
            #          OrderedDict([('act_noise', act_noise)]))
            #
            # distiller.log_training_progress(stats, None,
            #                                 self.episode, steps_completed=self.current_layer_id,
            #                                 total_steps=self.amc_cfg.conv_cnt, log_freq=1, loggers=[self.tflogger])
            # Neta: noise decay
            if t > start_steps:
                act_noise = act_noise * 0.97

            # Neta: don't learn while in heatup
            if t > start_steps:
                for _ in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {x_ph: batch['obs1'],
                                 x2_ph: batch['obs2'],
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done']
                                 }

                    # Q-learning update
                    outs = sess.run([q_loss, q, train_q_op], feed_dict)
                    logger.store(LossQ=outs[0], QVals=outs[1])

                    # Policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
コード例 #16
0
ファイル: vpg.py プロジェクト: moshebeutel/spinningup
def simple_vpg(env_fn=lambda: gym.make('CartPole-v1'),
               actor_critic=None,
               ac_kwargs=dict(),
               seed=0,
               steps_per_epoch=5000,
               epochs=10000,
               gamma=0.99,
               pi_lr=1e-5,
               logger_kwargs=dict(),
               save_freq=10):

    # Global variables
    num_of_train_iterations = epochs
    # `number_of_layers` hidden layers with `hidden_dim` units each
    hidden_dim = 32
    number_of_layers = 2

    learning_rate = pi_lr
    batch_size = steps_per_epoch
    discount_factor = gamma

    #init

    # init log
    logger = EpochLogger(**logger_kwargs)

    logger.save_config(locals())

    #make gym enviornment
    env = env_fn()
    obs_dim = env.observation_space.shape[0]

    # model - build computation graph
    with tf.variable_scope('model'):
        #input layer
        obs_ph = tf.placeholder(dtype=np.float32,
                                shape=(None, obs_dim),
                                name='obs_ph')
        #mlp (Multi Layer Perceptron) - hidden layers
        #define sizes for hidden layers
        hidden_sizes = [hidden_dim] * number_of_layers
        #build hidden layers
        x = obs_ph
        for h in hidden_sizes:
            x = tf.layers.dense(x, units=h, activation=tf.tanh)
        #logits
        logits = tf.layers.dense(x, units=env.action_space.n, activation=None)
        #actions - output layer
        actions = tf.squeeze(tf.multinomial(logits=logits, num_samples=1),
                             axis=1)

    #define loss function

    rtgs_ph = tf.placeholder(shape=(None, ), dtype=tf.float32, name='rtgs_ph')
    acts_ph = tf.placeholder(shape=(None, ), dtype=tf.int32, name='acts_ph')
    action_mask = tf.one_hot(acts_ph, env.action_space.n)
    log_probs = tf.reduce_sum(action_mask * tf.nn.softmax(logits=logits),
                              axis=1)
    loss = -tf.reduce_mean(rtgs_ph * log_probs)

    #define optimizer
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(loss)

    #session init
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    logger.setup_tf_saver(session,
                          inputs={'x': obs_ph},
                          outputs={'pi': actions})

    #train
    for iter in range(num_of_train_iterations):
        #single train iteration

        #reset batch data
        batch_obs, batch_acts, batch_rtgs, batch_rets, batch_lens = [], [], [], [], []
        #reset episodic data
        obs, reward, done, ep_rews = env.reset(), 0, False, []
        while len(batch_obs) < batch_size:
            #record batch
            # env.render()
            batch_obs.append(obs.copy())
            selected_action = session.run(actions,
                                          {obs_ph: obs.reshape(1, -1)})[0]
            obs, reward, done, _ = env.step(selected_action)
            batch_acts.append(selected_action)
            ep_rews.append(reward)
            if done:
                #end of episode
                batch_rets.append(sum(ep_rews))
                batch_lens.append(len(ep_rews))
                #calc reward to go
                n = len(ep_rews)
                x = np.array(ep_rews)
                #prepare array of powers of dicount factor
                # y = discount_factor ** np.arange(n)
                # rtgs_array = np.zeros_like(x, dtype=np.float32)
                # for step in range(n):
                #     rtgs_array[step] = sum(x[step:]*y[:n-step])
                rtgs_array = np.zeros_like(x)
                for i in reversed(range(n)):
                    rtgs_array[i] = x[i] + (rtgs_array[i +
                                                       1] if i + 1 < n else 0)
                batch_rtgs += list(rtgs_array)
                #reset episodic data
                obs, reward, done, ep_rews = env.reset(), 0, False, []

        if (iter % save_freq == 0) or (iter >= num_of_train_iterations - 1):
            logger.save_state({'env': env}, None)
        # TODO: normalize advs trick:
        # batch_advs = np.array(batch_rtgs)
        # batch_advs = (batch_advs - np.mean(batch_advs))/(np.std(batch_advs) + 1e-8)
        feed_dict = {
            obs_ph: np.array(batch_obs[:len(batch_rtgs)]),
            acts_ph: np.array(batch_acts[:len(batch_rtgs)]),
            rtgs_ph: np.array(batch_rtgs)
        }
        batch_loss, _ = session.run([loss, optimizer], feed_dict=feed_dict)

        # log epoch results
        logger.log_tabular('Epoch', iter)
        logger.log_tabular('TotalEnvInteracts', (iter + 1) * batch_size)
        logger.log_tabular('loss', batch_loss)
        logger.log_tabular('AverageEpRet', np.mean(batch_rets))
        logger.log_tabular('epispode mean length', np.mean(batch_lens))
        logger.dump_tabular()
コード例 #17
0
ファイル: dd_dqn.py プロジェクト: ac-93/braille_rl
def dd_dqn(env,
           logger_kwargs=dict(),
           network_params=dict(),
           rl_params=dict(),
           resume_training=False,
           resume_params=dict()):

    logger = EpochLogger(**logger_kwargs)

    if not resume_training:
        save_vars = locals().copy()
        save_vars.pop('env')
        logger.save_config(save_vars)

    # ==== control params ====
    seed = rl_params['seed']
    epochs = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size = rl_params['replay_size']
    update_freq = rl_params['update_freq']
    n_updates = rl_params['n_updates']
    batch_size = rl_params['batch_size']
    start_steps = rl_params['start_steps']
    max_ep_len = rl_params['max_ep_len']
    num_tests = rl_params['num_tests']
    save_freq = rl_params['save_freq']

    # ==== rl params ====
    use_HER = rl_params['use_HER']
    use_prev_a = rl_params['use_prev_a']
    gamma = rl_params['gamma']
    polyak = rl_params['polyak']
    q_lr = rl_params['q_lr']

    # ==== noise params ====
    act_noise_min = rl_params['act_noise_min']
    act_noise_max = rl_params['act_noise_max']
    act_noise_max_steps = rl_params['act_noise_max_steps']
    test_act_noise = rl_params['test_act_noise']

    # if resuming sess is passed as a param
    if not resume_training:
        sess = tf.compat.v1.Session(config=tf_config)

    # set seeding (still not perfectly deterministic)
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.np_random.seed(seed)

    # get required gym spaces
    obs = env.observation_space
    act = env.action_space

    # get the size after resize
    obs_dim = network_params['input_dims']
    act_dim = act.n
    goal_dim = len(env.goal_list)

    # add action dimension to network params
    network_params['output_dim'] = act_dim

    if not resume_training:
        # init a state buffer for storing last m states
        train_state_buffer = StateBuffer(m=obs_dim[2])
        test_state_buffer = StateBuffer(m=obs_dim[2])

        # Experience buffer
        replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                     act_dim=act_dim,
                                     goal_dim=goal_dim,
                                     size=replay_size)

        # Inputs to computation graph
        x_ph, a_ph, prev_a_ph, x2_ph, r_ph, d_ph, g_ph = placeholders(
            obs_dim, act_dim, act_dim, obs_dim, None, None, goal_dim)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            value_x1, advantage_x1, value_x2, advantage_x2 = action_value_networks(
                x_ph, x2_ph, use_prev_a, a_ph, prev_a_ph, g_ph, network_params)

        # Target networks
        with tf.variable_scope('target'):
            _, _, value_targ_x2, advantage_targ_x2 = action_value_networks(
                x_ph, x2_ph, use_prev_a, a_ph, prev_a_ph, g_ph, network_params)

        var_counts = tuple(
            count_vars(scope) for scope in ['main/q', 'target/q'])
        print("""\nNumber of parameters:
                   main q: %d
                   target q: %d \n""" % var_counts)

        # combine value and advantage functions
        q_x1 = value_x1 + tf.subtract(
            advantage_x1, tf.reduce_mean(advantage_x1, axis=1, keepdims=True))
        q_x2 = value_x2 + tf.subtract(
            advantage_x2, tf.reduce_mean(advantage_x2, axis=1, keepdims=True))
        q_targ = value_targ_x2 + tf.subtract(
            advantage_targ_x2,
            tf.reduce_mean(advantage_targ_x2, axis=1, keepdims=True))

        # get the index of the maximum q value, corresponds with action taken
        pi = tf.argmax(q_x1, axis=1)

        # get q values for actions taken
        q_val = tf.reduce_sum(tf.multiply(q_x1, a_ph), axis=1)

        # Double QL uses maximum action from main network but q value from target
        max_q_x2 = tf.one_hot(tf.argmax(q_x2, axis=1), depth=act_dim)
        q_targ_val = tf.reduce_sum(tf.multiply(q_targ, max_q_x2), axis=1)

        # Bellman backup for Q function
        q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_targ_val)

        # mean squared error loss
        q_loss = 0.5 * tf.reduce_mean((q_val - q_backup)**2)

        # set up optimizer
        trainable_vars = get_vars('main/q')
        q_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=q_lr,
                                                       epsilon=1e-04)
        train_q_op = q_optimizer.minimize(q_loss,
                                          var_list=trainable_vars,
                                          name='train_q_op')

        # Polyak averaging for target variables (polyak=0.00 for hard update)
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ],
                                 name='target_update')

        # Initializing targets to match main variables
        target_init = tf.group([
            tf.compat.v1.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    else:
        # if resuming define all the ph and outputs from saved model
        # inputs
        x_ph = resume_params['model']['x_ph']
        a_ph = resume_params['model']['a_ph']
        prev_a_ph = resume_params['model']['prev_a_ph']
        x2_ph = resume_params['model']['x2_ph']
        r_ph = resume_params['model']['r_ph']
        d_ph = resume_params['model']['d_ph']
        g_ph = resume_params['model']['g_ph']

        # outputs
        pi = resume_params['model']['pi']
        q_loss = resume_params['model']['q_loss']
        q_x1 = resume_params['model']['q_x1']

        # small buffers
        replay_buffer = resume_params['resume_state']['replay_buffer']
        train_state_buffer = resume_params['resume_state'][
            'train_state_buffer']
        test_state_buffer = resume_params['resume_state']['test_state_buffer']

        # get needed operations from graph by name (trouble saving these)
        train_q_op = tf.get_default_graph().get_operation_by_name("train_q_op")
        target_update = tf.get_default_graph().get_operation_by_name(
            "target_update")

    if not resume_training:
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(target_init)
    else:
        sess = resume_params['sess']

    # Setup model saving
    if save_freq is not None:
        logger.setup_tf_saver(sess,
                              inputs={
                                  'x_ph': x_ph,
                                  'a_ph': a_ph,
                                  'prev_a_ph': prev_a_ph,
                                  'x2_ph': x2_ph,
                                  'r_ph': r_ph,
                                  'd_ph': d_ph,
                                  'g_ph': g_ph
                              },
                              outputs={
                                  'pi': pi,
                                  'q_loss': q_loss,
                                  'q_x1': q_x1
                              })

    def get_action(state, one_hot_goal, prev_a, noise_scale):
        state = state.astype('float32') / 255.
        if np.random.random_sample() < noise_scale:
            a = env.action_space.sample()
        else:
            a = sess.run(pi,
                         feed_dict={
                             x_ph: [state],
                             g_ph: [one_hot_goal],
                             prev_a_ph: [prev_a]
                         })[0]
        return a

    def reset(state_buffer):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        o = process_image_observation(o, obs_dim)
        r = process_reward(r)
        state = state_buffer.init_state(init_obs=o)
        prev_a = np.zeros(act_dim)

        # new random goal when the env is reset
        goal_id = np.random.randint(goal_dim)
        one_hot_goal = np.eye(goal_dim)[goal_id]
        goal = env.goal_list[goal_id]
        env.goal_button = goal
        # print('Goal Button: {}'.format(goal))

        return o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a

    def test_agent(n=1):
        print('Testing...')
        for j in range(n):
            test_o, test_r, test_d, test_ep_ret, test_ep_len, test_state, test_one_hot_goal, test_prev_a = reset(
                test_state_buffer)

            while not (test_d or (test_ep_len == max_ep_len)):

                test_a = get_action(test_state, test_one_hot_goal, test_prev_a,
                                    test_act_noise)

                test_o, test_r, test_d, _ = env.step(test_a)
                test_o = process_image_observation(test_o, obs_dim)
                test_r = process_reward(test_r)
                test_state = test_state_buffer.append_state(test_o)

                test_ep_ret += test_r
                test_ep_len += 1

                test_one_hot_a = process_action(test_a, act_dim)
                test_prev_a = test_one_hot_a

            logger.store(TestEpRet=test_ep_ret, TestEpLen=test_ep_len)

    # ================== Main training Loop  ==================
    if not resume_training:
        start_time = time.time()
        o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset(
            train_state_buffer)

        total_steps = steps_per_epoch * epochs
        act_noise = update_eps(current_step=0,
                               min_eps=act_noise_min,
                               max_eps=act_noise_max,
                               max_steps=act_noise_max_steps)
        resume_t = 0

        # array for storing states used with HER
        if use_HER:
            HER_buffer = HERBuffer(obs_dim=obs_dim,
                                   act_dim=act_dim,
                                   goal_dim=goal_dim,
                                   size=max_ep_len)

    # resuming training
    else:
        start_time = time.time()
        total_steps = steps_per_epoch * (epochs +
                                         resume_params['additional_epochs'])
        act_noise = resume_params['resume_state']['act_noise']
        HER_buffer = resume_params['resume_state']['HER_buffer']
        resume_t = resume_params['resume_state']['resume_t']
        o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = resume_params[
            'resume_state']['rl_state']

        # reset the environment to the state set before saving
        env.set_env_state(resume_params['resume_state']['env_state'])

    # Main loop: collect experience in env and update/log each epoch
    for t in range(resume_t, total_steps):

        if t > start_steps:
            a = get_action(state, one_hot_goal, prev_a, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        o2 = process_image_observation(o2, obs_dim)  # thresholding done in env
        r = process_reward(r)
        one_hot_a = process_action(a, act_dim)

        next_state = train_state_buffer.append_state(o2)

        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        # if life is lost then store done as true true
        replay_buffer.store(state, one_hot_a, prev_a, r, next_state, d,
                            one_hot_goal)

        # append to HER buffer
        if use_HER:
            HER_buffer.store(state, one_hot_a, prev_a, r, next_state, d,
                             one_hot_goal)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        state = next_state
        prev_a = one_hot_a

        # store additional states in replay buffer where the goal
        # is given by the final state, if the final state was incorrect
        if use_HER:
            if d and (ep_len != max_ep_len):

                # get actual goal achieved
                achieved_goal = np.eye(goal_dim)[env.goal_list.index(
                    env.latest_button)]

                # if an incorrect goal was reached
                if (achieved_goal != one_hot_goal).any():

                    for j in range(ep_len):
                        # pull data from HER buffer
                        sample = HER_buffer.sample(j)

                        # change this to calc_rew function in env
                        if j == ep_len - 1:
                            new_rew = env.max_rew
                        else:
                            new_rew = sample['rews']

                        # add to replay buffer
                        replay_buffer.store(sample['obs1'], sample['acts'],
                                            sample['prev_acts'], new_rew,
                                            sample['obs2'], sample['done'],
                                            achieved_goal)

        # do a single update
        if t > 0 and t % update_freq == 0:
            for i in range(n_updates):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    prev_a_ph: batch['prev_acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    g_ph: batch['goal']
                }

                # Q-learning update
                outs = sess.run([q_loss, q_x1, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

        if d or (ep_len == max_ep_len):
            # store episode values
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            # reset the environment
            o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset(
                train_state_buffer)

            if use_HER:
                # reset HER buffer
                HER_buffer.reset()

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:

            epoch = t // steps_per_epoch

            # update target network
            outs = sess.run(target_update)

            # update actor noise every epoch
            act_noise = update_eps(current_step=t,
                                   min_eps=act_noise_min,
                                   max_eps=act_noise_max,
                                   max_steps=act_noise_max_steps)

            # save everything neccessary for restarting training from current position
            env_state = env.get_env_state()

            # Save model
            if save_freq is not None:
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    print('Saving...')
                    rl_state = [
                        o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a
                    ]
                    logger.save_state(
                        state_dict={
                            'env_state': env_state,
                            'replay_buffer': replay_buffer,
                            'train_state_buffer': train_state_buffer,
                            'test_state_buffer': test_state_buffer,
                            'HER_buffer': HER_buffer,
                            'act_noise': act_noise,
                            'resume_t': t + 1,
                            'rl_state': rl_state
                        })
            # Test the performance of the deterministic version of the agent. (resets the env)
            test_agent(n=num_tests)

            # set params for resuming training
            env.set_env_state(env_state)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Actor Noise', act_noise)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'),
                  show_plot=False)
コード例 #18
0
def ppo(env_fn,
        GUI=True,
        actor_critic=my_mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        on_policy=True,
        prev_epochs=0):
    """
    Proximal Policy Optimization (by clipping),

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        GUI : Whether or not display GUI during training.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`.

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    if GUI:
        env = env_fn("GUI", prev_epochs)
    else:
        env = env_fn("DIRECT", prev_epochs)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    sess = tf.Session()

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    # Main outputs from computation graph
    pi, logp, logp_pi, v, mu, log_std = actor_critic(x_ph, a_ph, **ac_kwargs)

    # if load_path==None:
    #     # Inputs to computation graph
    #     x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    #     # Main outputs from computation graph
    #     pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
    # else:
    #     fname = osp.join(load_path, 'tf1_save')
    #     print('\n\nLoading old model from %s.\n\n' % fname)
    #
    #     # load the things!
    #     model = restore_tf_graph(sess, fname)
    #     x_ph, a_ph = model['x'], model['a']
    #     pi, logp, logp_pi, v = model['pi'], model['logp'], model['logp_pi'], model['v']

    # Calculated through one epoch, assigned by buf's methods
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'v': v,
                              'logp': logp,
                              'logp_pi': logp_pi
                          })

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)
        # lllogp, mmmu, llog_std = sess.run([logp, mu, log_std], feed_dict=inputs)

        # logp is basically the same as logp_old_ph, the error starts from 1e-6,
        # and this error is a little strange...

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        last_noise_time = 0.0
        noise = np.zeros(12)
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(
                                          1,
                                          -1)})  # CHANGE THE feed_dict HERE!
            # aa = a.copy()
            # if 2.0 < env.t < 4.0:
            #     # on_policy = False
            #     if env.t - last_noise_time > 0.1:
            #         noise = np.random.uniform(-0.5 * np.pi, 0.5 * np.pi, 12)
            #         last_noise_time += 0.1
            #     a += noise
            #     logp_t = sess.run(logp, feed_dict={x_ph: o.reshape(1, -1), a_ph: a})
            # else:
            #     # on_policy = True
            #     pass
            # print("time:", env.t, a-aa)

            if not on_policy:
                a = np.array([get_action_from_target_policy(env.t)])
                logp_t = sess.run(logp,
                                  feed_dict={
                                      x_ph: o.reshape(1, -1),
                                      a_ph: a
                                  })

            env.history_buffer['last_action'] = a[0]
            for i in range(
                    25):  # Change the frequency of control from 500Hz to 20Hz
                o2, r, d, o2_dict = env.step(a[0])

            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2
            # print(ep_len, d)
            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                if d:
                    last_val = 0
                    # print(o2_dict['position'])
                    # print(np.alltrue(o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]) is True)
                    # print(np.alltrue([o2_dict['position'][i] < -1 for i in [1, 4, 7, 10]]))
                    # print("I did it!!!")
                else:
                    # last_val = sess.run(v, feed_dict={x_ph: o.reshape(1, -1)})
                    last_val = 0
                buf.finish_path(last_val)
                print(ep_ret)

                # logger.store(EpRet=ep_ret+last_val, EpLen=ep_len)
                # if terminal:
                #     o, ep_ret, ep_len = env.reset(), 0, 0

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0
                last_noise_time = 0.0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()
        env.addEpoch()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()  # show the log

        if time.ctime()[-13:-11] == '09':
            break

    env.close()
コード例 #19
0
def asac(env_fn, actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001,
         delta=0.02, sample_step=2000):

    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    alpha_ph = core.scale_holder()
    # Main outputs from computation graph

    #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs)
    # Target value network
    with tf.variable_scope('target'):
        _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts)
    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    adv = Q_pi - R

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5*tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss
    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)
    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """
    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update, R_loss, Q_loss]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
    total_steps = steps_per_epoch * epochs

    counter = 0
    ret_epi = []
    obs_epi = []
    loss_old = 10000
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             alpha_ph: alpha_t
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7], LossR=outs[11])
                counter += 1
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
            logger.store(RetEst=ret_est)
            if counter >= 1000:
                loss_new, _ = logger.get_stats('LossPi')
                counter = 0
                if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps:
                    rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32)
                    rho_ptr = 0
                    for sample_t in range(sample_step):
                        a = get_action(o)
                        o2, r, d, _ = env.step(a)
                        ep_len += 1
                        d = False if ep_len == max_ep_len else d
                        rho_s[rho_ptr] = o
                        o = o2
                        if d or (ep_len == max_ep_len):
                            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    advantages = sess.run(adv, feed_dict={x_ph: rho_s})
                    alpha.update_alpha(advantages)
                    #alpha.update_alpha(rho_q-rho_v)
                    alpha_t = alpha()
                    print(alpha_t)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    loss_old = 10000
                else:
                    loss_old = loss_new
        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('RetEst', average_only=True)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossR', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #20
0
class DeterministicLearner:
    """ Learner for training Agents with deterministic policies,
    and thus have different behavior during training and testing """
    def __init__(self,
                 agent,
                 env,
                 steps_per_epoch=5000,
                 epochs=50,
                 seed=0,
                 max_ep_len=1000,
                 start_steps=10000,
                 replay_size=int(1e6),
                 batch_size=100,
                 n_test_episodes=10,
                 output_dir=None,
                 output_fname='progress.txt',
                 exp_name=None):
        self.epoch_len, self.n_epochs = steps_per_epoch, epochs
        self.max_ep_len, self.start_steps = max_ep_len, start_steps
        self.n_test_episodes = n_test_episodes
        self.logger = EpochLogger(output_dir=output_dir,
                                  output_fname=output_fname,
                                  exp_name=exp_name)
        print('locals')
        for key, val in locals().items():
            print('{}: {}'.format(key, len(str(val))))
        # self.logger.save_config(locals())
        self.env, self.agent = env, agent
        self.buffer = OffPolicyBuffer(buffer_size=replay_size,
                                      epoch_size=steps_per_epoch,
                                      batch_size=batch_size)
        saver_kwargs = agent.build_graph(env.observation_space,
                                         env.action_space)
        self.logger.setup_tf_saver(**saver_kwargs)
        var_counts = tuple(
            tf_utils.trainable_count(scope) for scope in ['pi', 'q'])
        self.logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' %
                        var_counts)
        np.random.seed(seed)
        tf.set_random_seed(seed)

    def episode_step(self,
                     obs,
                     rew,
                     is_term,
                     ep_len,
                     ep_ret,
                     epoch_ctr,
                     testing=False):
        """ take a single step in the episode """
        # environment variables to store in buffer
        env_to_buffer = dict(obs=obs, rew=rew, is_term=is_term)
        # Take agent step, return values to store in buffer, and in logs
        act = self.agent.step(obs, testing=testing)
        if not testing:
            self.buffer.store({**env_to_buffer, 'act': act})
            epoch_ctr += 1
        ep_len += 1
        ep_ret += rew
        obs, rew, is_term, _ = self.env.step(act)
        return obs, rew, is_term, ep_len, ep_ret, epoch_ctr

    def play_episode(self, epoch_ctr=0, testing=False):
        """ play out an episode until one of these things happens:
        1. episode ends
        2. max episode length is reached
        3. end of epoch is reached """
        obs = self.env.reset()
        rew, ep_len, ep_ret, is_term_state = 0, 0, 0, False
        while ((ep_len < self.max_ep_len) and (not is_term_state)
               and (epoch_ctr < self.epoch_len)):
            step_ret = self.episode_step(obs,
                                         rew,
                                         is_term_state,
                                         ep_len,
                                         ep_ret,
                                         epoch_ctr,
                                         testing=testing)
            obs, rew, is_term_state, ep_len, ep_ret, epoch_ctr = step_ret
        ep_ret += rew  # important! add the last reward to the return!
        log_prefix = 'Test' if testing else ''
        if (is_term_state) or (ep_len >= self.max_ep_len):
            self.logger.store(**{
                log_prefix + 'EpRet': ep_ret,
                log_prefix + 'EpLen': ep_len
            })
        if not testing:
            self.buffer.finish_path(last_obs=obs)
        return ep_len, ep_ret, epoch_ctr

    def train_episode(self, ep_len):
        """ train agent at the end of episode """
        batches = self.buffer.batches(n_batches=ep_len)
        for train_iter, batch in enumerate(batches):
            to_logger = self.agent.train(train_iter, batch)
            self.logger.store(**to_logger)

    def run_epoch(self):
        """ run epoch of training + evaluation """
        epoch_ctr = 0
        while epoch_ctr < self.epoch_len:
            ep_len, _, epoch_ctr = self.play_episode(epoch_ctr=epoch_ctr,
                                                     testing=False)
            self.train_episode(ep_len)
        self.test_epoch(self.n_test_episodes)

    def test_epoch(self, n_test_episodes):
        """ perform testing for an epoch """
        for _ in range(n_test_episodes):
            self.play_episode(0, testing=True)

    def learn(self):
        """ Train the agent over n_epochs """
        for epoch in range(self.n_epochs):
            start_time = time.time()
            self.run_epoch()
            self.log_epoch(epoch, start_time)
            self.logger.save_state({'env': self.env}, None)
        self.agent.sess.close()

    def log_epoch(self, epoch, start_time):
        """ Log info about epoch """
        self.logger.log_tabular('Epoch', epoch)
        self.logger.log_tabular('EpRet', with_min_and_max=True)
        self.logger.log_tabular('EpLen', average_only=True)
        self.logger.log_tabular('TestEpRet', with_min_and_max=True)
        self.logger.log_tabular('TestEpLen', average_only=True)
        self.logger.log_tabular('TotalEnvInteracts',
                                (epoch + 1) * self.epoch_len)
        self.logger.log_tabular('Time', time.time() - start_time)
        for column_name, kwargs in self.agent.log_tabular_kwargs.items():
            self.logger.log_tabular(column_name, **kwargs)
        self.logger.dump_tabular()
コード例 #21
0
ファイル: trpo.py プロジェクト: eladsar/spinningup
def trpo(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         delta=0.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=0.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=0.8,
         lam=0.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo='trpo'):
    """
    Trust Region Policy Optimization 

    (with support for Natural Policy Gradient)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given 
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log 
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as 
                                            | specified by the inputs to 
                                            | ``info_phs``) over the batch of 
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure 
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update. 
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be 
            smallish. Adjusts Hessian-vector product calculation:
            
            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient. 
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform. 
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down. 

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the 
            backtracking line search. Since the line search usually doesn't 
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are 
            almost the same.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph
               ] + core.values_as_sorted_list(info_phs)

    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO losses
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph, hvp = core.hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy(
        )  # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[
                1], agent_outs[2], agent_outs[3:]

            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #22
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dims = env.action_space  #[ choice.shape for choice in env.action_space.values() ]
    #act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholder(None), core.placeholder(
        None), {}
    for k in env.action_space:
        logp_old_ph[k] = core.placeholder(None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dims, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio, min_adv, pi_loss = {}, {}, {}
    for k in env.action_space:
        ratio[k] = tf.exp(logp[k] - logp_old_ph[k])  # pi(a|s) / pi_old(a|s)
        min_adv[k] = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                              (1 - clip_ratio) * adv_ph)
        pi_loss[k] = -tf.reduce_mean(tf.minimum(ratio[k] * adv_ph, min_adv[k]))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl, approx_ent, clipped, clipfrac = {}, {}, {}, {}
    for k in env.action_space:
        approx_kl[k] = tf.reduce_mean(
            logp_old_ph[k] -
            logp[k])  # a sample estimate for KL-divergence, easy to compute
        approx_ent[k] = tf.reduce_mean(
            -logp[k])  # a sample estimate for entropy, also easy to compute
        clipped[k] = tf.logical_or(ratio[k] > (1 + clip_ratio), ratio[k] <
                                   (1 - clip_ratio))
        clipfrac[k] = tf.reduce_mean(tf.cast(clipped[k], tf.float32))

    pi_loss_sum = tf.reduce_sum(list(pi_loss.values()))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss_sum)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    save_outputs = {'v': v}
    for k in env.action_space:
        save_outputs['pi_' + k] = pi[k]
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs=save_outputs)

    def update():

        inputs = {}
        for k, v in zip(all_phs, buf.get()):
            if type(k) is not dict:
                inputs[k] = v
            else:
                for k_, v_ in zip(k.values(), v.values()):
                    inputs[k_] = v_

        pi_l_old, v_l_old, ent = sess.run([pi_loss_sum, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            for k in kl:
                kl[k] = mpi_avg(kl[k])
            if max(list(kl.values())) > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss_sum, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        sum_dict = lambda x: x if type(x) is not dict else np.sum(
            list(x.values()))

        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=sum_dict(kl),
                     Entropy=sum_dict(ent),
                     ClipFrac=sum_dict(cf),
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            o2, r, d, _ = env.step(**a)
            env.render()  #force_realtime=True)
            ep_ret += r
            #print ("frame_return: %.4f sofar_EpRet: %.4f" % (r, ep_ret))
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                print("EpRet:", ep_ret)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #23
0
def sac1_carla(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(3e5), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    obs_space = env.observation_space.spaces[0]
    act_space = env.action_space
    obs_dim = obs_space.shape
    act_dim = act_space.shape

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space


    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
    
    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=list(obs_dim), act_dim=list(act_dim), size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in 
                       ['main/cnn_layer', 'main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t cnn_layer: %d, \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)

######
    if alpha == 'auto':
        target_entropy = (-np.prod(env.action_space.shape))

        log_alpha = tf.get_variable( 'log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)
    q_backup = r_ph + gamma*(1-d_ph)*v_backup


    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - tf.stop_gradient(q1_pi))
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    cnn_params = get_vars('main/cnn_layer')
    # Policy train op 
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    pi_params = get_vars('main/pi')
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list = cnn_params + pi_params)

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list = cnn_params + value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                train_pi_op, train_value_op, target_update, train_alpha_op]


    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 
                                outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o[np.newaxis,...]})[0]

    def test_agent(n=1):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == 200)): # max_ep_len
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = [0.35, 0]
            # a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):
            print('EpRet: ',ep_ret, 'ep_len: ', ep_len)
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                            }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                            Q1Vals=outs[3], Q2Vals=outs[4],
                            LogPi=outs[5], Alpha=outs[6])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha',average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True) 
            logger.log_tabular('Q2Vals', with_min_and_max=True) 
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
コード例 #24
0
def sigail(env_fn,
           traj_dir,
           actor_critic=core.mlp_actor_critic_add,
           ac_kwargs=dict(),
           d_hidden_size=64,
           seed=0,
           steps_per_epoch=4000,
           epochs=50,
           gamma=0.99,
           clip_ratio=0.2,
           pi_lr=3e-4,
           vf_lr=1e-3,
           train_pi_iters=40,
           train_v_iters=40,
           lam=0.97,
           max_ep_len=4000,
           beta=1e-4,
           target_kl=0.01,
           logger_kwargs=dict(),
           save_freq=100,
           r_env_ratio=0,
           d_itr=20,
           reward_type='negative',
           trj_num=20,
           buf_size=1000,
           si_update_ratio=0.02,
           js_smooth=5,
           buf_update_type='random',
           pretrain_bc_itr=0):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    D = Discriminator(env, hidden_size=d_hidden_size,
                      reward_type=reward_type)  #!add Discriminator object
    D_js_m = JS_div_machine(env, hidden_size=d_hidden_size)

    e_obs = np.zeros((buf_size, obs_dim[0]))
    e_act = np.zeros((buf_size, act_dim[0]))
    Sibuffer = SIBuffer(obs_dim,
                        act_dim,
                        e_obs,
                        e_act,
                        trj_num=trj_num,
                        max_size=buf_size,
                        js_smooth_num=js_smooth)  #!sibuf
    trj_full = False
    assert e_obs.shape[1:] == obs_dim
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, pi_std, entropy, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    #buf_gail = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)#add buffer with TRgail rewards

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(
        ratio * adv_ph, min_adv)) - beta * entropy  #add entropy
    v_loss = tf.reduce_mean((ret_ph - v)**2)  #ret_phには累積報酬のバッファが入る
    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()

    BC = BehavioralCloning(sess, pi, logp, x_ph, a_ph)
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Sync params across processes

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v
                  for k, v in zip(all_phs, buf.get())
                  }  #all_phsは各バッファーに対応するプレースホルダー辞書
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training#ここも変える必要あり? おそらく変えなくて良い
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:  #更新時のklが想定の1.5倍大きいとログをだしてtrainループを着る
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):  #vの更新
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update(新しいロスの計算)
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)

        std, std_ent = sess.run([pi_std, entropy], feed_dict=inputs)
        logger.store(
            LossPi=pi_l_old,
            LossV=v_l_old,
            KL=kl,
            Entropy=std_ent,
            ClipFrac=cf,
            DeltaLossPi=(pi_l_new - pi_l_old),  #更新での改善量
            DeltaLossV=(v_l_new - v_l_old),
            Std=std)

    start_time = time.time()
    o, r, d, ep_ret_task, ep_ret_gail, ep_len = env.reset(), 0, False, 0, 0, 0

    if pretrain_bc_itr > 0:
        BC.learn(Sibuffer.expert_obs,
                 Sibuffer.expert_act,
                 max_itr=pretrain_bc_itr)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            '''
            if t <150:
                env.render()
                time.sleep(0.03)
            '''

            ep_ret_task += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                '''
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                '''

                #!add discriminator train
                '''#終端も加えるならアリッチャあり
                o_reshape = o.reshape(core.combined_shape(1,obs_dim))
                a_reshape = a.reshape(core.combined_shape(1,act_dim))
                agent_obs = np.append(buf.obs_buf[buf.path_slice()],o_reshape,axis = 0)#!o を(obspace,)→(1,obspace)に変換してからアペンド
                agent_act = np.append(buf.act_buf[buf.path_slice()],a_reshape,axis = 0)#終端での状態行動対も加えてDを学習
                '''
                agent_obs = buf.obs_buf[buf.path_slice()]
                agent_act = buf.act_buf[buf.path_slice()]

                #D.train(sess,e_obs,e_act ,agent_obs,agent_act)

                #↓buf.r_gail_buf[slice(buf.path_start_idx+1, buf.ptr+2)] = D.get_reward_buf(sess,agent_obs, agent_act).ravel()#状態行動対の結果としての報酬をbufferに追加(報酬は一個ずれる)

                if trj_full:
                    gail_r = 1
                else:
                    gail_r = 0
                rew_gail = gail_r * D.get_reward(
                    sess, agent_obs,
                    agent_act).ravel()  #状態行動対の結果としての報酬をbufferに追加(報酬は一個ずれる)

                ep_ret_gail += rew_gail.sum()  #!before gail_ratio
                ep_ret_sum = r_env_ratio * ep_ret_task + ep_ret_gail

                rew_gail_head = rew_gail[:-1]
                last_val_gail = rew_gail[-1]

                buf.rew_buf[slice(
                    buf.path_start_idx + 1,
                    buf.ptr)] = rew_gail_head + r_env_ratio * buf.rew_buf[
                        slice(buf.path_start_idx + 1,
                              buf.ptr)]  #!add GAIL reward 最後の報酬は含まれないため長さが1短い

                if d:  # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = r_env_ratio * r + last_val_gail
                else:
                    last_val = sess.run(v,
                                        feed_dict={x_ph: o.reshape(1, -1)
                                                   })  #v_last=...だったけどこれで良さげ

                buf.finish_path(
                    last_val)  #これの前にbuf.finish_add_r_vがなされていることを確認すべし
                if terminal:
                    #only store trajectory to SIBUffer if trajectory finished
                    if trj_full:
                        Sibuffer.store(
                            agent_obs, agent_act,
                            sum_reward=ep_ret_task)  #!store trajectory
                    else:
                        Sibuffer.store(
                            agent_obs, agent_act,
                            sum_reward=ep_ret_task)  #!store trajectory
                    logger.store(EpRet=ep_ret_task,
                                 EpRet_Sum=ep_ret_sum,
                                 EpRet_Gail=ep_ret_gail,
                                 EpLen=ep_len)

                o, r, d, ep_ret_task, ep_ret_sum, ep_ret_gail, ep_len = env.reset(
                ), 0, False, 0, 0, 0, 0

        # Save model

        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        if not (trj_full):
            M_obs_buf = Sibuffer.get_obs_trj()
        trj_full = (M_obs_buf.shape[0] >= buf_size)

        if trj_full:  #replaybufferがr_thresholdよりも大きいとき
            Sibuffer.update_main_buf(ratio_update=si_update_ratio,
                                     update_type=buf_update_type)
            M_obs_buf = Sibuffer.get_obs_trj()
            M_act_buf = Sibuffer.get_act_trj()

            d_batch_size = len(agent_obs)
            for _t in range(d_itr):
                e_obs_batch, e_act_batch = Sibuffer.get_random_batch(
                    d_batch_size)

                D.train(sess, e_obs_batch, e_act_batch, agent_obs, agent_act)

                D_js_m.train(sess, M_obs_buf, M_act_buf, e_obs,
                             e_act)  #バッファとエキスパートの距離を見るためにtrain
            js_d = D.get_js_div(sess, Sibuffer.main_obs_buf,
                                Sibuffer.main_act_buf, agent_obs, agent_act)
            js_d_m = D_js_m.get_js_div(sess, M_obs_buf, M_act_buf, e_obs,
                                       e_act)

        else:
            js_d, js_d_m = 0.5, 0.5
        update()

        Sibuffer.store_js(js_d)
        logger.store(JS=js_d,
                     JS_M=js_d_m,
                     JS_Ratio=Sibuffer.js_ratio_with_random)

        # Log info about epoch
        #if epoch%10 == 0:#logger print each 10 epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpRet_Sum', average_only=True)
        logger.log_tabular('EpRet_Gail', average_only=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.log_tabular('Std', average_only=True)
        logger.log_tabular('buffer_r', Sibuffer.buffer_r_average)
        logger.log_tabular('JS', average_only=True)
        logger.log_tabular('JS_M', average_only=True)
        logger.log_tabular('JS_Ratio', average_only=True)
        logger.dump_tabular()
コード例 #25
0
def vpg(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-2,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        #logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', average_only=True)
        #logger.log_tabular('EpLen', average_only=True)
        #logger.log_tabular('VVals', with_min_and_max=True)
        #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        #logger.log_tabular('LossPi', average_only=True)
        #logger.log_tabular('LossV', average_only=True)
        #logger.log_tabular('DeltaLossPi', average_only=True)
        #logger.log_tabular('DeltaLossV', average_only=True)
        #logger.log_tabular('Entropy', average_only=True)
        #logger.log_tabular('KL', average_only=True)
        #logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
コード例 #26
0
def sac(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        explorer=None,
        eps=.03,
        pretrain_epochs=0):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
            x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    pretrain_steps = steps_per_epoch * pretrain_epochs
    total_epochs = epochs + pretrain_epochs
    total_steps = steps_per_epoch * total_epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        if t > start_steps:
            a = get_action(o)
        elif pretrain_steps == 0:  # only explore if not pretraining with MaxEnt
            a = env.action_space.sample()

        # use MaxEnt exploration if you are in a pretrain epoch or if eps-greedy
        pre = t < pretrain_steps
        during = random.random() < eps
        if pre or during:
            if explorer is None:
                raise ValueError('Trying to explore but explorer is None')
            state = env.env.state_vector()
            a = explorer.sample_action(state)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             LossV=outs[3],
                             Q1Vals=outs[4],
                             Q2Vals=outs[5],
                             VVals=outs[6],
                             LogPi=outs[7])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
コード例 #27
0
def cvi_ad(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99, alp = 0.8,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000, 
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1, decay = None, squash = False):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. 
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    adv_ph = tf.placeholder(dtype = tf.float32, shape = (None,))
    alp_ph = tf.placeholder(dtype =  tf.float32)
    t_step = tf.placeholder(dtype = tf.float32)
    #adv_ph1 = tf.placeholder(dtype = tf.float32, shape = (None,))
    #adv_ph2 = tf.placeholder(dtype = tf.float32, shape = (None,))

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, ad1, ad2, ad1_pi, ad2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
    
    # Target value network
    with tf.variable_scope('target'):
        _, _, _, ad1_targ, ad2_targ, _, _, v_targ  = actor_critic(x2_ph, a_ph, **ac_kwargs)
    
    squash_eps = 1e-2
    if squash:
        print("Squashed")
        squash_func = lambda x: tf.sign(x) * (tf.sqrt(tf.abs(x) + 1) - 1) + x * squash_eps
        squash_ifunc = lambda x: tf.sign(x) * ((tf.sqrt(1 + 4 * squash_eps * (tf.abs(x) + 1 + squash_eps)) - 1)** 2 * (1 / (2 * squash_eps))** 2 - 1)
    else:
        print ("Not Squashed")
        squash_func = lambda x: x
        squash_ifunc = lambda x: x
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in 
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    q1 = v + ad1
    q2 = v + ad2
    q1_pi = v + ad1_pi
    q2_pi = v + ad2_pi

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(squash_func(r_ph + gamma*(1-d_ph)*squash_ifunc(v_targ) + alp_ph * adv_ph))
    #q_backup1 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph1)
    #q_backup2 = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ + alp * adv_ph2)

    v_backup = tf.stop_gradient(squash_func(squash_ifunc(min_q_pi) - alpha * logp_pi))

    # Soft actor-critic losses
    #alp = tf.Variable(0.2,dtype=tf.float32)
    #q_min = tf.minimum(q1,q2)
    pi_loss = tf.reduce_mean(alpha * logp_pi - squash_ifunc(min_q_pi))
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op 
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') 
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main') , get_vars('target'))])
        # target_update = tf.group([tf.assign(v_targ, tf.cond(tf.not_equal(t_step%1000,0), lambda: v_targ, lambda: v_main))
        #                           for v_main, v_targ in zip(get_vars('main') , get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
                train_pi_op, train_value_op, target_update]
    
    # adv_op = squash_ifunc(tf.minimum(q1_targ, q2_targ))-squash_ifunc(v_targ)
    adv_op = squash_ifunc(tf.minimum(ad1_targ, ad2_targ))
    #adv_op1 = q1_targ-v_targ
    #adv_op2 = q2_targ-v_targ

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 
                                outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0]

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs


    if decay:
       alp_val = 0.2
    else:
       alp_val = alp

    update_step = 0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                update_step+=1
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x2_ph: batch['obs1'],
                             a_ph: batch['acts']
                            }
                advantage = sess.run(adv_op , feed_dict)
                #advantage = sess.run([adv_op1, adv_op2] , feed_dict)
          
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             t_step: update_step,
                             adv_ph : advantage,
                             alp_ph : alp_val
                             #adv_ph1 : advantage[0],
                             #adv_ph2 : advantage[1]
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            if decay:
               alp_val = eval(decay)(t//steps_per_epoch)

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True) 
            logger.log_tabular('Q2Vals', with_min_and_max=True) 
            logger.log_tabular('VVals', with_min_and_max=True) 
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
コード例 #28
0
ファイル: ddpg_lp.py プロジェクト: LinghengMeng/spinningup
def ddpg_lp(env_fn, render_env=False, ac_kwargs=dict(), seed=0,
            steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
            polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
            act_noise=0.1, policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Hyper-parameters for NNs
    hidden_sizes = list(ac_kwargs['hidden_sizes'])
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear

    # Main actor-critic
    with tf.variable_scope('main'):
        actor = MLP(hidden_sizes + [act_dim],
                    hidden_activation=actor_hidden_activation,
                    output_activation=actor_output_activation)
        critic = MLP(hidden_sizes + [1],
                     hidden_activation=critic_hidden_activation,
                     output_activation=critic_output_activation)
        pi = act_limit * actor(x_ph)
        q = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1)), axis=1)
        q_pi = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1)), axis=1)

    # Target actor-critic
    with tf.variable_scope('target'):
        actor_targ = MLP(hidden_sizes + [act_dim],
                         hidden_activation=actor_hidden_activation,
                         output_activation=actor_output_activation)
        critic_targ = MLP(hidden_sizes + [1],
                          hidden_activation=critic_hidden_activation,
                          output_activation=critic_output_activation)
        # Note that we only need q_targ(s, pi_targ(s)).
        pi_targ = act_limit * actor_targ(x2_ph)
        q_pi_targ = tf.squeeze(critic_targ(tf.concat([x2_ph, pi_targ], axis=-1)), axis=1)


    # Learning Progress on Actor
    with tf.variable_scope('actor_learning_progress'):
        actor_lp = LearningProgress(memory_length=10, input_dim=obs_dim, output_dim=act_dim,
                                    hidden_sizes = hidden_sizes,
                                    hidden_activation=actor_hidden_activation,
                                    output_activation=actor_output_activation,
                                    logger_kwargs=logger_kwargs,
                                    loger_file_name='actor_lp_log.txt')

    # Learning Progress on Critic
    with tf.variable_scope('critic_learning_progress'):
        critic_lp = LearningProgress(memory_length=10, input_dim=obs_dim+act_dim, output_dim=1,
                                     hidden_sizes=hidden_sizes,
                                     hidden_activation=critic_hidden_activation,
                                     output_activation=critic_output_activation,
                                     logger_kwargs=logger_kwargs,
                                     loger_file_name='critic_lp_log.txt')

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q-backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables)
    train_q_op = q_optimizer.minimize(q_loss, var_list=critic.variables)

    # Polyak averaging for target variables
    target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                              for v_main, v_targ in zip(actor.variables+critic.variables,
                                                        actor_targ.variables+critic_targ.variables)])
    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(actor.variables+critic.variables,
                                                      actor_targ.variables + critic_targ.variables)])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def get_learning_progress_based_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ep_actor_lp_norm, ep_actor_var_outputs, ep_actor_var_first_half_outputs, ep_actor_var_second_half_outputs = 0, 0, 0, 0
    ep_critic_lp_norm, ep_critic_var_outputs, ep_critic_var_first_half_outputs, ep_critic_var_second_half_outputs = 0, 0, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Update actor_learning_progress with latest weights
        update_latest_memory_every_n_steps = 1000
        if t % update_latest_memory_every_n_steps==0:
            actor_lp.update_latest_memory(actor.get_weights())

        # Step the env
        if render_env:
            env.render()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # import pdb; pdb.set_trace()
        actor_lp_norm, actor_var_outputs, \
        actor_var_first_half_outputs,\
        actor_var_second_half_outputs = actor_lp.compute_learning_progress(o, sess, t, start_time)
        ep_actor_lp_norm += actor_lp_norm
        ep_actor_var_outputs += actor_var_outputs
        ep_actor_var_first_half_outputs += actor_var_first_half_outputs
        ep_actor_var_second_half_outputs += actor_var_second_half_outputs

        critic_lp_norm, critic_var_outputs, \
        critic_var_first_half_outputs, \
        critic_var_second_half_outputs = critic_lp.compute_learning_progress(np.concatenate((o,a)), sess, t, start_time)
        ep_critic_lp_norm += critic_lp_norm
        ep_critic_var_outputs += critic_var_outputs
        ep_critic_var_first_half_outputs += critic_var_first_half_outputs
        ep_critic_var_second_half_outputs += critic_var_second_half_outputs

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done']
                            }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                    logger.store(LossPi=outs[0])


            logger.store(EpRet=ep_ret, EpLen=ep_len)
            logger.store(EpActorLPNorm=ep_actor_lp_norm)
            logger.store(EpActorVar=ep_actor_var_outputs)
            logger.store(EpActorVarFirst=ep_actor_var_first_half_outputs)
            logger.store(EpActorVarSecond=ep_actor_var_second_half_outputs)
            logger.store(EpCriticLPNorm=ep_critic_lp_norm)
            logger.store(EpCriticVar=ep_critic_var_outputs)
            logger.store(EpCriticVarFirst=ep_critic_var_first_half_outputs)
            logger.store(EpCriticVarSecond=ep_critic_var_second_half_outputs)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ep_actor_lp_norm, ep_actor_var_outputs, ep_actor_var_first_half_outputs, ep_actor_var_second_half_outputs = 0, 0, 0, 0
            ep_critic_lp_norm, ep_critic_var_outputs, ep_critic_var_first_half_outputs, ep_critic_var_second_half_outputs = 0, 0, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('EpActorLPNorm', with_min_and_max=True)
            logger.log_tabular('EpActorVar', with_min_and_max=True)
            logger.log_tabular('EpActorVarFirst', with_min_and_max=True)
            logger.log_tabular('EpActorVarSecond', with_min_and_max=True)
            logger.log_tabular('EpCriticLPNorm', with_min_and_max=True)
            logger.log_tabular('EpCriticVar', with_min_and_max=True)
            logger.log_tabular('EpCriticVarFirst', with_min_and_max=True)
            logger.log_tabular('EpCriticVarSecond', with_min_and_max=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
コード例 #29
0
def sppo(args,
         env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=200,
         target_kl=0.01,
         logger_kwargs=dict(),
         save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    ###########
    if args.alpha == 'auto':
        target_entropy = 0.35

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=tf.log(0.2))
        alpha = tf.exp(log_alpha)
    else:
        alpha = args.alpha
    ###########

    # Main outputs from computation graph
    mu, pi, logp, logp_pi, v, q, h = actor_critic(alpha, x_ph, a_ph,
                                                  **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, h]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    ######

    if args.alpha == 'auto':
        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(-h + target_entropy)
        )  # tf.clip_by_value(-h + target_entropy, 0.0, 1000.0 )

        alpha_optimizer = MpiAdamOptimizer(learning_rate=1e-5)
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])

    ######

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)

    # For PPO
    # min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # ### Scheme1: SPPO NO.2: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * tf.stop_gradient(logp)
    # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    # ### Scheme3: SPPO NO.3: add entropy
    # adv_logp = adv_ph - tf.stop_gradient(alpha) * logp_old_ph
    # min_adv = tf.where(adv_logp>0, (1+clip_ratio)*adv_logp, (1-clip_ratio)*adv_logp)
    # pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_logp, min_adv))

    ### Scheme2: SPPO NO.2: add entropy
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(
        tf.minimum(ratio * adv_ph, min_adv) + tf.stop_gradient(alpha) * h)

    v_loss = tf.reduce_mean((ret_ph - v)**2)  #+(ret_ph - q)**2)/2.0

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        h)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(
        learning_rate=args.pi_lr).minimize(pi_loss + 0.1 * v_loss)
    # train_v = MpiAdamOptimizer(learning_rate=args.vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            if args.alpha == 'auto':
                sess.run(train_alpha_op, feed_dict=inputs)
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        # for _ in range(train_v_iters):
        #     sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old),
                     Alpha=sess.run(alpha) if args.alpha == 'auto' else alpha)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t, h_t = sess.run(get_action_ops,
                                           feed_dict={x_ph: o.reshape(1, -1)})
            # q_t = sess.run(q, feed_dict={x_ph: o.reshape(1,-1), a_ph: a})
            # SPPO NO.1: add entropy
            # rh = r - args.alpha * logp_t
            if args.alpha == 'auto':
                rh = r + sess.run(alpha) * h_t
            else:
                rh = r + alpha * h_t  # exact entropy

            # save and log
            buf.store(o, a, rh, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r

            ep_len += 1
            # d = False if ep_len == max_ep_len else d

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # # Save model
        # if (epoch % save_freq == 0) or (epoch == epochs-1):
        #     logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Alpha', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
コード例 #30
0
def ppo(workload_file,
        model_path,
        ac_kwargs=dict(),
        seed=0,
        traj_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        pre_trained=0,
        trained_model=None,
        attn=False,
        shuffle=False,
        backfil=False,
        skip=False,
        score_type=0,
        batch_job_slice=0,
        sched_algo=4):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = HPCEnvSkip(shuffle=shuffle,
                     backfil=backfil,
                     skip=skip,
                     job_score_type=score_type,
                     batch_job_slice=batch_job_slice,
                     build_sjf=False,
                     sched_algo=sched_algo)
    env.seed(seed)
    env.my_init(workload_file=workload_file, sched_file=model_path)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['attn'] = attn

    # Inputs to computation graph

    buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE,
                    gamma, lam)

    if pre_trained:
        sess = tf.Session()
        model = restore_tf_graph(sess, trained_model)
        logger.log('load pre-trained model')
        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                   var_counts)

        x_ph = model['x']
        a_ph = model['a']
        mask_ph = model['mask']
        adv_ph = model['adv']
        ret_ph = model['ret']
        logp_old_ph = model['logp_old_ph']

        pi = model['pi']
        v = model['v']
        # logits = model['logits']
        out = model['out']
        logp = model['logp']
        logp_pi = model['logp_pi']
        pi_loss = model['pi_loss']
        v_loss = model['v_loss']
        approx_ent = model['approx_ent']
        approx_kl = model['approx_kl']
        clipfrac = model['clipfrac']
        clipped = model['clipped']

        # Optimizers
        # graph = tf.get_default_graph()
        # op = sess.graph.get_operations()
        # [print(m.values()) for m in op]
        # train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0')
        # train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0')
        train_pi = tf.get_collection("train_pi")[0]
        train_v = tf.get_collection("train_v")[0]
        # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad')
        # train_pi = train_pi_optimizer.minimize(pi_loss)
        # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad')
        # train_v = train_v_optimizer.minimize(v_loss)
        # sess.run(tf.variables_initializer(train_pi_optimizer.variables()))
        # sess.run(tf.variables_initializer(train_v_optimizer.variables()))
        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]
        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

    else:
        x_ph, a_ph = placeholders_from_spaces(env.observation_space,
                                              env.action_space)
        # y_ph = placeholder(JOB_SEQUENCE_SIZE*3) # 3 is the number of sequence features
        mask_ph = placeholder(env.action_space.n)
        adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None)

        # Main outputs from computation graph
        pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph,
                                                 **ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]

        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

        # Experience buffer

        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                   var_counts)

        # PPO objectives
        ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                           (1 - clip_ratio) * adv_ph)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
        v_loss = tf.reduce_mean((ret_ph - v)**2)

        # Info (useful to watch during learning)
        approx_kl = tf.reduce_mean(
            logp_old_ph -
            logp)  # a sample estimate for KL-divergence, easy to compute
        approx_ent = tf.reduce_mean(
            -logp)  # a sample estimate for entropy, also easy to compute
        clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio <
                                (1 - clip_ratio))
        clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

        # Optimizers
        train_pi = tf.train.AdamOptimizer(
            learning_rate=pi_lr).minimize(pi_loss)
        train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        tf.add_to_collection("train_pi", train_pi)
        tf.add_to_collection("train_v", train_v)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v})
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph,
                              'adv': adv_ph,
                              'mask': mask_ph,
                              'ret': ret_ph,
                              'logp_old_ph': logp_old_ph
                          },
                          outputs={
                              'pi': pi,
                              'v': v,
                              'out': out,
                              'pi_loss': pi_loss,
                              'logp': logp,
                              'logp_pi': logp_pi,
                              'v_loss': v_loss,
                              'approx_ent': approx_ent,
                              'approx_kl': approx_kl,
                              'clipped': clipped,
                              'clipfrac': clipfrac
                          })

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset(
    ), 0, False, 0, 0, 0, 0, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    start_time = time.time()
    for epoch in range(epochs):
        t = 0
        while True:
            # [no_skip, skip]
            lst = [1, 1]
            #for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES):
            #    job = o[i:i + JOB_FEATURES]
            #    # the skip time of will_skip job exceeds MAX_SKIP_TIME
            #    if job[-2] == 1.0:
            #        lst = [1,0]

            a, v_t, logp_t, output = sess.run(get_action_ops,
                                              feed_dict={
                                                  x_ph:
                                                  o.reshape(1, -1),
                                                  mask_ph:
                                                  np.array(lst).reshape(1, -1)
                                              })
            # print(a, end=" ")
            '''
            action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs)
            log_action_prob = np.log(action_probs[action])
            '''

            # save and log
            buf.store(o, None, a, np.array(lst), r, v_t, logp_t)
            logger.store(VVals=v_t)
            if a[0] == 1:
                skip_count += 1
            o, r, d, r2, sjf_t, f1_t = env.step(a[0])
            ep_ret += r
            ep_len += 1
            show_ret += r2
            sjf += sjf_t
            f1 += f1_t

            if d:
                t += 1
                buf.finish_path(r)
                logger.store(EpRet=ep_ret,
                             EpLen=ep_len,
                             ShowRet=show_ret,
                             SJF=sjf,
                             F1=f1,
                             SkipRatio=skip_count / ep_len)
                [
                    o, co
                ], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = env.reset(
                ), 0, False, 0, 0, 0, 0, 0, 0
                if t >= traj_per_epoch:
                    # print ("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action)
                    break
        # print("Sample time:", (time.time()-start_time)/num_total, num_total)
        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # start_time = time.time()
        update()
        # print("Train time:", time.time()-start_time)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts',
                           (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('ShowRet', average_only=True)
        logger.log_tabular('SJF', average_only=True)
        logger.log_tabular('F1', average_only=True)
        logger.log_tabular('SkipRatio', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()