Code Example #1
def asac(env_fn, actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001,
         delta=0.02, sample_step=2000):

    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    alpha_ph = core.scale_holder()
    # Main outputs from computation graph

    #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs)
    # Target value network
    with tf.variable_scope('target'):
        _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts)
    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    adv = Q_pi - R

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5*tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss
    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)
    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """
    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update, R_loss, Q_loss]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
    total_steps = steps_per_epoch * epochs

    counter = 0
    ret_epi = []
    obs_epi = []
    loss_old = 10000
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             alpha_ph: alpha_t
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7], LossR=outs[11])
                counter += 1
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
            logger.store(RetEst=ret_est)
            if counter >= 1000:
                loss_new, _ = logger.get_stats('LossPi')
                counter = 0
                if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps:
                    rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32)
                    rho_ptr = 0
                    for sample_t in range(sample_step):
                        a = get_action(o)
                        o2, r, d, _ = env.step(a)
                        ep_len += 1
                        d = False if ep_len == max_ep_len else d
                        rho_s[rho_ptr] = o
                        rho_ptr += 1  # advance the write index so each on-policy sample is kept
                        o = o2
                        if d or (ep_len == max_ep_len):
                            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    advantages = sess.run(adv, feed_dict={x_ph: rho_s})
                    alpha.update_alpha(advantages)
                    #alpha.update_alpha(rho_q-rho_v)
                    alpha_t = alpha()
                    print(alpha_t)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    loss_old = 10000
                else:
                    loss_old = loss_new
        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('RetEst', average_only=True)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossR', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
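
The listing above depends on helpers that are not shown here (core, EpochLogger, get_vars, the Alpha schedule class, and ReplayBuffer). As a point of reference, the following is a minimal sketch of a replay buffer compatible with the store(o, a, r, o2, d) and sample_batch() calls and the obs1/obs2/acts/rews/done batch keys used in the training loop above; everything beyond those names is an assumption, not the original implementation.

import numpy as np

class ReplayBuffer:
    """Minimal FIFO experience replay buffer (sketch; assumes flat observations)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
        self.acts_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest entry once the buffer is full.
        self.obs1_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])
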
Code Example #2
def sac1_rnn(args, env_fn, actor_critic=core.mlp_actor_critic, sac1_dynamic_rnn=core.sac1_dynamic_rnn,
             ac_kwargs=dict(), seed=0, Lb=10, Lt=10, hc_dim=128, steps_per_epoch=3000, epochs=100,
             replay_size=int(5e5), gamma=0.99, reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2,
             h0=1.0, batch_size=150, start_steps=10000, max_ep_len_train=1000, max_ep_len_test=1000,
             logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float or 'auto'): Entropy regularization coefficient (equivalent
            to the inverse of the reward scale in the original SAC paper).
            If 'auto', alpha is tuned automatically during training.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len_train / max_ep_len_test (int): Maximum length of a
            trajectory / episode / rollout during training and testing,
            respectively.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn('train'), env_fn('test')
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    ######################################
    # Inputs to computation graph
    # x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    #
    # # Main outputs from computation graph
    # with tf.variable_scope('main'):
    #     mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, **ac_kwargs)
    #
    # # Target value network
    # with tf.variable_scope('target'):
    #     _, _, logp_pi_, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, a_ph, **ac_kwargs)
    #
    ######################################

    obs_ph, hc_ph = core.placeholders((Lb + Lt + 1, obs_dim), (hc_dim,))
    a_ph_all, r_ph_all, d_ph_all, data01_ph = core.placeholders((Lb + Lt, act_dim), (Lb + Lt,), (Lb + Lt,), (Lb + Lt,))

    obs_burn = obs_ph[:, :Lb]
    obs_train = obs_ph[:, Lb:]

    obs12_train = data01_ph[:, Lb:]
    # obs12_train = tf.transpose(obs12_train, perm=[1, 0])

    a_ph = a_ph_all[:, Lb:]
    r_ph = r_ph_all[:, Lb:]
    d_ph = d_ph_all[:, Lb:]

    _, state_burn_in = sac1_dynamic_rnn(obs_burn, hc_ph)
    state_burn_in = tf.stop_gradient(state_burn_in) * data01_ph[:, 0][..., tf.newaxis]
    s_outputs, _ = sac1_dynamic_rnn(obs_train, state_burn_in)
    s_ph = s_outputs[:, :-1]
    s2_ph = s_outputs[:, 1:]

    logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = [None, ] * Lt, [None, ] * Lt, [None, ] * Lt, \
                                              [None, ] * Lt, [None, ] * Lt, [None, ] * Lt
    logp_pi_, q1_pi_, q2_pi_ = [None, ] * Lt, [None, ] * Lt, [None, ] * Lt

    for i in range(Lt):
        # Main outputs from computation graph
        with tf.variable_scope('main', reuse=tf.AUTO_REUSE):
            _, _, logp_pi[i], logp_pi2[i], q1[i], q2[i], q1_pi[i], q2_pi[i] = actor_critic(s_ph[:, i],
                                                                                           s2_ph[:, i],
                                                                                           a_ph[:, i],
                                                                                           **ac_kwargs)
        # Target value network
        with tf.variable_scope('target', reuse=tf.AUTO_REUSE):
            _, _, logp_pi_[i], _, _, _, q1_pi_[i], q2_pi_[i] = actor_critic(s2_ph[:, i], s2_ph[:, i], a_ph[:, i],
                                                                            **ac_kwargs)

    logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = tf.stack(logp_pi, axis=1), tf.stack(logp_pi2, axis=1), \
                            tf.stack(q1, axis=1), tf.stack(q2, axis=1), tf.stack(q1_pi, axis=1), tf.stack(q2_pi, axis=1)
    logp_pi_, q1_pi_, q2_pi_ = tf.stack(logp_pi_, axis=1), tf.stack(q1_pi_, axis=1), tf.stack(q2_pi_, axis=1)

    ######################################

    # Experience buffer
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
    replay_buffer_rnn = ReplayBuffer_RNN(Lb=Lb, Lt=Lt, hc_dim=hc_dim, obs_dim=obs_dim, act_dim=act_dim,
                                         size=replay_size)
    # Count variables
    # var_counts = tuple(core.count_vars(scope) for scope in
    #                    ['main/pi', 'main/q1', 'main/q2', 'rnn'])
    # print(('\nNumber of parameters: \t pi: %d, \t' + 'q1: %d, \t q2: %d, \t rnn: %d\n') % var_counts)
    # print('Number of parameters: \t Total: %d\n' % sum(var_counts))

    ######
    if alpha == 'auto':
        target_entropy = (-np.prod(env.action_space.shape))

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * h0, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi_ = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi_ - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(obs12_train * (alpha * logp_pi - q1_pi))
    q1_loss = 0.5 * tf.reduce_mean(obs12_train * (q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean(obs12_train * (q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    pi_params = get_vars('main/pi')
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=pi_params)

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('rnn')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Inputs to computation graph
    x_ph_geta, hc_ph_geta, a_ph_geta = core.placeholders((1, obs_dim), hc_dim, act_dim)

    s_geta, hc_geta = sac1_dynamic_rnn(x_ph_geta, hc_ph_geta)
    # Main outputs from computation graph
    with tf.variable_scope('main', reuse=tf.AUTO_REUSE):
        mu, pi, _, _, _, _, _, _ = actor_critic(s_geta[:, 0], s_geta[:, 0], a_ph_geta, **ac_kwargs)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
    #                       outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})
    saver = tf.train.Saver()

    checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints'
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    if args.is_test or args.is_restore_train:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")
    # def get_action(o, deterministic=False):
    #     act_op = mu if deterministic else pi
    #     return sess.run(act_op, feed_dict={x_ph_geta: o.reshape(1, -1)})[0]#[0]

    def get_action(o, hc_0, deterministic=False):
        """s_t_0_  starting step for testing 1 H"""

        act_op = mu if deterministic else pi
        action, hc_1 = sess.run([act_op, hc_geta], feed_dict={x_ph_geta: o.reshape(1, 1, obs_dim),
                                                              hc_ph_geta: hc_0})
        # time.sleep(0.001)
        return action[0], hc_1

    ##############################  test  ############################

    if args.is_test:
        test_env = gym.make(args.env)
        ave_ep_ret = 0
        for j in range(10000):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            hc_run_test = np.zeros((1, hc_dim,), dtype=np.float32)
            while not d:  # (d or (ep_len == 2000)):
                # get_action needs the recurrent hidden state; take deterministic actions at test time
                a_test, hc_run_test = get_action(o, hc_run_test, True)
                o, r, d, _ = test_env.step(a_test)
                ep_ret += r
                ep_len += 1
                if args.test_render:
                    test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret, '({}/10000)'.format(j + 1))
        return

    ##############################  train  ############################
    def test_agent(n=5):
        # print('test')
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            hc_run_test = np.zeros((1, hc_dim,), dtype=np.float32)
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                a_test, hc_run_test = get_action(o, hc_run_test, True)
                o, r, d, _ = test_env.step(a_test)
                time.sleep(0.001)
                ep_ret += r
                ep_len += 1
                # test_env.render()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    ################################## deques

    obs_hc_queue = deque([], maxlen=Lb + Lt + 1)
    a_r_d_data01_queue = deque([], maxlen=Lb + Lt)

    ################################## deques

    start_time = time.time()

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    ################################## deques reset
    t_queue = 1
    hc_run = np.zeros((1, hc_dim,), dtype=np.float32)
    for _i in range(Lb):
        obs_hc_queue.append((np.zeros((obs_dim,), dtype=np.float32), np.zeros((hc_dim,), dtype=np.float32)))
        a_r_d_data01_queue.append((np.zeros((act_dim,), dtype=np.float32), 0.0, False, False))
    obs_hc_queue.append((o, hc_run[0]))

    ################################## deques reset

    total_steps = steps_per_epoch * epochs

#    test_ep_ret = test_ep_ret_1 = -10000.0
    test_ep_ret_best = test_ep_ret = -10000.0
    # Main loop: collect experience in env and update/log each epoch
    start = time.time()
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a, hc_run = get_action(o, hc_run)
        else:
            _, hc_run = get_action(o, hc_run)
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if ep_len==max_ep_len_train else d

        # Store experience to replay buffer
        # replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        #################################### deques store

        a_r_d_data01_queue.append((a, r, d, True))
        obs_hc_queue.append((o2, hc_run[0]))

        if t_queue % Lt == 0:
            replay_buffer_rnn.store(obs_hc_queue, a_r_d_data01_queue)

        if (d or (ep_len == max_ep_len_train)) and t_queue % Lt != 0:
            for _0 in range(Lt - t_queue % Lt):
                a_r_d_data01_queue.append((np.zeros((act_dim,), dtype=np.float32), 0.0, False, False))
                obs_hc_queue.append((np.zeros((obs_dim,), dtype=np.float32), np.zeros((hc_dim,), dtype=np.float32)))
            replay_buffer_rnn.store(obs_hc_queue, a_r_d_data01_queue)

        t_queue += 1

        #################################### deques store

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer_rnn.sample_batch(batch_size)
                feed_dict = {obs_ph: batch['obs'],
                             hc_ph: batch['hc'],
                             a_ph_all: batch['acts'],
                             r_ph_all: batch['rews'],
                             d_ph_all: batch['done'],
                             data01_ph: batch['data01']
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3][:, 0],
                             Q2Vals=outs[4][:, 0],
                             LogPi=outs[5][:, 0],
                             Alpha=outs[6])

            logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len)
            print("ep_len", ep_len, "time", time.time() - start)
            start = time.time()
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            ################################## deques reset
            t_queue = 1
            hc_run = np.zeros((1, hc_dim,), dtype=np.float32)
            for _i in range(Lb):
                obs_hc_queue.append((np.zeros((obs_dim,), dtype=np.float32), np.zeros((hc_dim,), dtype=np.float32)))
                a_r_d_data01_queue.append((np.zeros((act_dim,), dtype=np.float32), 0.0, False, False))
            obs_hc_queue.append((o, hc_run[0]))

            ################################## deques reset

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if epoch < 2000:
                test_agent(25)
                # test_ep_ret = logger.get_stats('TestEpRet')[0]
                # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            else:
                test_agent(25)
                test_ep_ret = logger.get_stats('TestEpRet')[0]
                # logger.epoch_dict['TestEpRet'] = []
                print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            # test_agent(25)

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Name', getattr(args, 'name', 'sac1_rnn'))  # experiment name, assumed to be provided on args
            logger.log_tabular('EpRet', with_min_and_max=True)

            logger.log_tabular('TestEpRet', with_min_and_max=True)
            # test_ep_ret_1 = logger.get_stats('TestEpRet')[0]

            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=False)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model
            if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t)
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
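
The recurrent variant above stores fixed-length windows of Lb burn-in steps plus Lt training steps, together with the RNN hidden state at the start of each window and a data01 mask marking padded steps. The original ReplayBuffer_RNN is not shown, so the following is a minimal sketch only, assuming the batch keys obs/hc/acts/rews/done/data01 and the shapes implied by the placeholders above.

import numpy as np

class ReplayBuffer_RNN:
    """Sketch of a sequence replay buffer for the burn-in RNN trainer above (details assumed)."""

    def __init__(self, Lb, Lt, hc_dim, obs_dim, act_dim, size):
        L = Lb + Lt
        self.obs_buf = np.zeros([size, L + 1, obs_dim], dtype=np.float32)
        self.hc_buf = np.zeros([size, hc_dim], dtype=np.float32)      # hidden state at the start of the window
        self.acts_buf = np.zeros([size, L, act_dim], dtype=np.float32)
        self.rews_buf = np.zeros([size, L], dtype=np.float32)
        self.done_buf = np.zeros([size, L], dtype=np.float32)
        self.data01_buf = np.zeros([size, L], dtype=np.float32)       # 1 for real steps, 0 for padding
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs_hc_queue, a_r_d_data01_queue):
        obs, hc = zip(*obs_hc_queue)                 # Lb+Lt+1 observations and matching hidden states
        a, r, d, data01 = zip(*a_r_d_data01_queue)   # Lb+Lt transitions
        self.obs_buf[self.ptr] = np.asarray(obs)
        self.hc_buf[self.ptr] = hc[0]                # hidden state preceding the burn-in segment
        self.acts_buf[self.ptr] = np.asarray(a)
        self.rews_buf[self.ptr] = np.asarray(r)
        self.done_buf[self.ptr] = np.asarray(d, dtype=np.float32)
        self.data01_buf[self.ptr] = np.asarray(data01, dtype=np.float32)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs=self.obs_buf[idxs], hc=self.hc_buf[idxs], acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs], done=self.done_buf[idxs], data01=self.data01_buf[idxs])
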
Code Example #3
File: ddpg.py  Project: T3p/spinningup
def ddpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, 
         polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, 
         update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, 
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1, info_key='danger',
         init=None):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of 
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
        
        info_key (str): key of the info to log (sum of infos)

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs, init=init)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n'%var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q = ac.q(o,a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort 
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len, tr_info, it_info = env.reset(), 0, 0, 0, 0
    eps_per_iter = 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        
        # The usual start_steps warm-up (uniform-random actions for better
        # exploration) is intentionally skipped in this variant: the learned
        # policy (with some noise, via act_noise) is used from the very first step.
        #if t > start_steps:
        a = get_action(o, act_noise)
        #else:
        #    a = env.action_space.sample()

        # Step the env
        #print(o, a)
        o2, r, d, info = env.step(a)
        #print(o, a, ac.act(torch.as_tensor(o, dtype=torch.float32)), o2, info[info_key])
        ep_ret += r
        ep_len += 1
        tr_info = max(info[info_key], tr_info)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)            
            it_info += tr_info
            o, ep_ret, ep_len, tr_info = env.reset(), 0, 0, 0
            eps_per_iter += 1

        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0:
            epoch = (t+1) // steps_per_epoch
            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            perf = logger.get_stats('EpRet')[0]
            for name, param in ac.pi.named_parameters():
                if param.requires_grad:
                    print('Policy params:', param.data)
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.log_tabular('Info', it_info / eps_per_iter)
            logger.log_tabular('fail', it_info)
            logger.log_tabular('perf', perf)
            logger.dump_tabular()
            it_info = 0
            eps_per_iter = 0
    return perf
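
A hypothetical way to launch the trainer above. The wrapper and environment name are illustrative assumptions: ddpg() reads info[info_key] on every step, so the environment's info dict must expose that key (here 'danger'); all other arguments fall back to the defaults in the signature.

import gym

class InfoKeyWrapper(gym.Wrapper):
    """Hypothetical wrapper: makes sure info always carries the key ddpg() logs."""

    def __init__(self, env, key='danger'):
        super().__init__(env)
        self.key = key

    def step(self, action):
        o, r, d, info = self.env.step(action)
        info.setdefault(self.key, 0.0)  # default: no signal reported this step
        return o, r, d, info

if __name__ == '__main__':
    ddpg(lambda: InfoKeyWrapper(gym.make('Pendulum-v0'), key='danger'),
         epochs=10, steps_per_epoch=4000,
         logger_kwargs=dict(output_dir='./ddpg_out', exp_name='ddpg_demo'),
         info_key='danger')
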
Code Example #4
def sac1(args,
         env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(2e6),
         gamma=0.99,
         reward_scale=1.0,
         polyak=0.995,
         lr=5e-4,
         alpha=0.2,
         batch_size=200,
         start_steps=10000,
         max_ep_len_train=1000,
         max_ep_len_test=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float or 'auto'): Entropy regularization coefficient (equivalent
            to the inverse of the reward scale in the original SAC paper).
            If 'auto', alpha is tuned automatically during training.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len_train / max_ep_len_test (int): Maximum length of a
            trajectory / episode / rollout during training and testing,
            respectively.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    if not args.is_test:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(
            x_ph, x2_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(
            x2_ph, x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)

    ######
    if alpha == 'auto':
        target_entropy = (-np.prod(env.action_space.shape))

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha *
                                    tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1,
                                                 name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])


    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [
            pi_loss, q1_loss, q2_loss, q1, q2, logp_pi,
            tf.identity(alpha), train_pi_op, train_value_op, target_update
        ]
    else:
        step_ops = [
            pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op,
            train_value_op, target_update, train_alpha_op
        ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ##############################  save and restore  ############################

    saver = tf.train.Saver()

    checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints'
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    if args.is_test or args.is_restore_train:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ##############################  test  ############################

    if args.is_test:
        test_env = gym.make(args.env)
        ave_ep_ret = 0
        for j in range(10000):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not d:  # (d or (ep_len == 2000)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if args.test_render:
                    test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:',
                  ave_ep_ret, '({}/10000)'.format(j + 1))
        return

    ##############################  train  ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                # test_env.render()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret_best = test_ep_ret = -10000.0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if ep_len==max_ep_len_train else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            ep_index += 1
            print('episode: {}, reward: {}'.format(ep_index,
                                                   ep_ret / reward_scale))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(int(1.5 * ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3],
                             Q2Vals=outs[4],
                             LogPi=outs[5],
                             Alpha=outs[6])

            logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            test_agent(10)
            # test_ep_ret = logger.get_stats('TestEpRet')[0]
            # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            if logger.get_stats('TestEpRet')[0] >= 280:
                print('Recalculating TestEpRet...')
                test_agent(100)
                test_ep_ret = logger.get_stats('TestEpRet')[0]
                # logger.epoch_dict['TestEpRet'] = []
                if test_ep_ret >= 300:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(ep_index, test_ep_ret))
                    exit()
                print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Num_Ep', ep_index)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=False)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model
            if ((epoch % save_freq == 0) or
                (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, checkpoint_path + '/model.ckpt',
                                       t)
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
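
The save block above uses saver and checkpoint_path, which are created earlier in the function and are not shown in this excerpt. The snippet below is a minimal, hypothetical TF1 sketch of how they are typically set up and how a saved checkpoint can be restored later; the directory name and the dummy variable are illustrative only, not the project's actual code.

# Hypothetical sketch: setting up saver / checkpoint_path (TF1-style) and
# restoring a checkpoint for evaluation. Names here are assumptions.
import os
import tensorflow as tf

checkpoint_path = os.path.join('checkpoints', 'asac_demo')  # assumed location
os.makedirs(checkpoint_path, exist_ok=True)

w = tf.get_variable('w', shape=[4], initializer=tf.zeros_initializer())  # dummy variable
saver = tf.train.Saver(max_to_keep=3)  # create after the graph is built

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Same call pattern as the training loop above (third argument = global_step):
    save_path = saver.save(sess, checkpoint_path + '/model.ckpt', 1000)
    print("Model saved in path: %s" % save_path)
    # Restore the most recent checkpoint:
    saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))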
コード例 #5
0
def maxsqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=200, start_steps=5000,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) If set to 'auto',
            alpha is tuned automatically.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    # print(max_ep_len,type(max_ep_len))
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)


    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    obs_space = env.observation_space
    act_dim = env.action_space.n
    act_space = env.action_space


    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None)


    ######
    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.n))
        # target_entropy = (np.prod(env.action_space.n))/4/10
        target_entropy = 0.35

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)
    ######


    # Main outputs from computation graph
    with tf.variable_scope('main'):
        v_x, mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu = actor_critic(x_ph,x2_ph, a_ph, alpha, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, logp_pi_, _,  _, _,q1_pi_, q2_pi_,q1_mu_, q2_mu_= actor_critic(x2_ph, x2_ph,a_ph, alpha,  **ac_kwargs)

    # Experience buffer
    if isinstance(act_space, Box):
        a_dim = act_dim
    elif isinstance(act_space, Discrete):
        a_dim = 1
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)


######
    if isinstance(alpha,tf.Tensor):
        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi_ + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)
    # min_q_pi = tf.minimum(q1_mu_, q2_mu_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi)  # entropy term "- alpha * logp_pi2" dropped here, i.e. alpha = 0
    q_backup = r_ph + gamma*(1-d_ph)*v_backup


    # Soft actor-critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    # # Policy train op
    # # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    #with tf.control_dependencies([train_pi_op]):
    train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha),
                train_value_op, target_update]
    else:
        step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, alpha,
                train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                                outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action0(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    def entropy(logits, alpha):
        logps = log_softmax(np.array(logits) / alpha)
        return -np.sum(np.exp(logps) * logps)

    # entf = lambda x, logits: x * entropy(logits, x) - 0.3
    entf = lambda x, logits: entropy(logits, x) - 0.35

    def softmax_ud(logits):
        logps = log_softmax(np.array(logits))
        return np.exp(logps)

    def get_action(o, deterministic=False):
        if deterministic:
            return sess.run(mu, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]
        else:
            q_logits = sess.run(v_x, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]
            try:
                # Solve for the temperature that hits the entropy target (0.35).
                alpha_ad = optimize.bisect(entf, a=0.0001, b=10000,
                                           args=(q_logits,), xtol=1e-4, rtol=1e-5)
            except ValueError:
                # No sign change on the bracket: fall back to a near-greedy temperature.
                print(q_logits)
                alpha_ad = 0.0001
            # alpha_ad = min(1.0, alpha_ad)
            # print(alpha_ad)
            # print(alpha_ad * entropy(q_logits, alpha_ad))
            return np.random.choice(act_dim, 1, p=softmax_ud(q_logits/alpha_ad))[0]

    def test_agent(n=20):  # n: number of tests
        # sess and the policy/Q ops are read from the enclosing maxsqn scope;
        # no global/nonlocal declaration is needed for read-only access.
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):  # max_ep_len
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()


    # o = env.reset()                                                     #####################
    # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0            #####################
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0



    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret = 0.0
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        # if t > start_steps and 100*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()



        # Step the env
        o2, r, d, _ = env.step(a)
        #print(a,o2)
        # o2, r, _, d = env.step(a)                     #####################
        # d = d['ale.lives'] < 5                        #####################

        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Train for int(1.5 * ep_len) gradient steps.
        if d or (ep_len == max_ep_len):   # make sure: max_ep_len < steps_per_epoch
            ep_index += 1
            print('episode: {}, ep_len: {}, reward: {}'.format(ep_index, ep_len, ep_ret))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(int(1.5*ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                            }
                # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossQ1=outs[0], LossQ2=outs[1],
                            Q1Vals=outs[2], Q2Vals=outs[3],
                            LogPi=outs[4], Alpha=outs[5])

            #if d:
            logger.store(EpRet=ep_ret, EpLen=ep_len)


            # o = env.reset()                                              #####################
            # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0     #####################
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0



        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # # Save model
            # if (epoch % save_freq == 0) or (epoch == epochs-1):
            #     logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.

            test_agent(10)
            if logger.get_stats('TestEpRet')[0] >= 190:
                print('Recalculating TestEpRet...')
                # test_agent(100)
                # test_ep_ret = logger.get_stats('TestEpRet')[0]

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha',average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            # logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
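
In get_action above, the exploratory branch searches, per state, for the softmax temperature whose resulting action distribution has the target entropy (0.35), by running bisection on entf. The standalone numpy/scipy sketch below reproduces that temperature search in isolation; the function names, bracket, and fallback value are illustrative, not the project's exact code.

# Standalone sketch of the per-state temperature search used by get_action:
# bisect for the softmax temperature alpha whose distribution has entropy ~= 0.35.
import numpy as np
from scipy import optimize
from scipy.special import log_softmax

TARGET_ENTROPY = 0.35  # same target as in the listing above

def entropy_at(alpha, logits):
    """Entropy of softmax(logits / alpha)."""
    logps = log_softmax(np.asarray(logits, dtype=np.float64) / alpha)
    return -np.sum(np.exp(logps) * logps)

def solve_temperature(logits, lo=1e-4, hi=1e4):
    """Find alpha with entropy_at(alpha, logits) == TARGET_ENTROPY via bisection."""
    f = lambda a, l: entropy_at(a, l) - TARGET_ENTROPY
    try:
        return optimize.bisect(f, lo, hi, args=(logits,), xtol=1e-4, rtol=1e-5)
    except ValueError:
        # No sign change on [lo, hi] (e.g. near-identical logits): near-greedy fallback.
        return lo

if __name__ == '__main__':
    logits = np.array([2.0, 0.5, -1.0])
    alpha = solve_temperature(logits)
    probs = np.exp(log_softmax(logits / alpha))
    print(alpha, probs, entropy_at(alpha, logits))  # entropy is ~0.35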
コード例 #6
0
def ppo(env,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=2048,
        epochs=250,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=100,
        train_v_iters=70,
        lam=0.95,
        max_ep_len=512,
        target_kl=0.005,
        logger_kwargs=dict(),
        save_freq=5):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env(
        "PandaPegIn",
        has_offscreen_renderer=True,
        # has_renderer=True,
        use_camera_obs=False,
        control_freq=100,
    )

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Load a pre-trained model (optional)
    # fname = "data/ppo_peg_in_add_delta_pos_plus_plus/ppo_peg_in_add_delta_pos_plus_plus_s0/pyt_save/model24.pt"
    # pre_model = torch.load(fname)
    # ac.pi = pre_model.pi
    # ac.v =pre_model.v

    # Use TensorboardX for logging
    writer = logger.create_writer()
    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(
            obs, act
        )  # only the pi network changes across iterations; data['obs'], data['act'], data['adv'], data['logp'] stay fixed within an update
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info  # minimize loss_pi; pi_info holds the KL divergence, entropy, and clip fraction for this update

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()  # data is refreshed once per update

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):  # minimize the loss while the KL divergence stays within budget
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()
            # print(i,':',loss_v)
            # print('='*20)

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Move to the initial position
    pre_action = [0, 0, 0]
    for i in range(4):
        o, _, _, _ = env.step(pre_action)

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        print("epoch:", epoch)
        for t in range(local_steps_per_epoch):
            # if( t == steps_per_epoch/2 ):
            # print("Half finished!")
            # Query the policy and value networks: action, value estimate, and log-prob of the chosen action
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r  # episode return so far
            ep_len += 1  # episode length so far

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:  # episode ended, time limit hit, or epoch boundary reached
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)  # compute GAE advantages and rewards-to-go

                # print("steps:",t)
                # print("done",d)

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, epoch)

        # Perform PPO update!
        update()

        # Write data to TensorboardX
        stats_to_write = logger.get_stats('EpRet')
        writer.add_scalar('AverageEpRet',
                          stats_to_write[0],
                          global_step=(epoch + 1) * 2048)

        # Log info about this epoch
        logger.log_tabular('Epoch', epoch)  # epoch index
        logger.log_tabular('EpRet',
                           with_min_and_max=True)  # max/min/mean episode return
        logger.log_tabular('EpLen', average_only=True)  # mean episode length
        logger.log_tabular('VVals', with_min_and_max=True)  # max/min/mean value estimates
        logger.log_tabular('TotalEnvInteracts',
                           (epoch + 1) * steps_per_epoch)  # total env steps so far
        logger.log_tabular('LossPi', average_only=True)  # policy loss at the start of the update
        logger.log_tabular('LossV', average_only=True)  # value loss at the start of the update
        logger.log_tabular('DeltaLossPi', average_only=True)  # change in policy loss over the update
        logger.log_tabular('DeltaLossV', average_only=True)  # change in value loss over the update
        logger.log_tabular('Entropy', average_only=True)  # policy entropy
        logger.log_tabular('KL', average_only=True)  # approximate KL divergence
        logger.log_tabular('ClipFrac', average_only=True)  # fraction of clipped ratios
        logger.log_tabular('StopIter', average_only=True)  # number of policy gradient iterations taken
        logger.log_tabular('Time', time.time() - start_time)  # elapsed time
        logger.dump_tabular()


# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--env', type=str, default='HalfCheetah-v2')
#     parser.add_argument('--hid', type=int, default=64)
#     parser.add_argument('--l', type=int, default=2)
#     parser.add_argument('--gamma', type=float, default=0.99)
#     parser.add_argument('--seed', '-s', type=int, default=0)
#     parser.add_argument('--cpu', type=int, default=1)
#     parser.add_argument('--steps', type=int, default=4000)
#     parser.add_argument('--epochs', type=int, default=50)
#     parser.add_argument('--exp_name', type=str, default='ppo')
#     args = parser.parse_args()

#     mpi_fork(args.cpu)  # run parallel code with mpi

#     from spinup.utils.run_utils import setup_logger_kwargs
#     logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)

#     ppo(lambda : gym.make(args.env), actor_critic=core.MLPActorCritic,
#         ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma,
#         seed=args.seed, steps_per_epoch=args.steps, epochs=args.epochs,
#         logger_kwargs=logger_kwargs)
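
In the main loop above, buf.finish_path(v) is described as computing GAE and rewards-to-go. The short numpy sketch below shows that computation in isolation, assuming the usual GAE-lambda recursion over one trajectory; it is not the project's PPOBuffer implementation.

# Minimal sketch of GAE-lambda advantages and discounted rewards-to-go for one
# trajectory, with a bootstrap value for paths cut off by the epoch boundary.
import numpy as np

def discount_cumsum(x, discount):
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for i in reversed(range(len(x))):
        running = x[i] + discount * running
        out[i] = running
    return out

def gae_and_returns(rews, vals, last_val=0.0, gamma=0.99, lam=0.95):
    rews = np.append(np.asarray(rews, dtype=np.float64), last_val)
    vals = np.append(np.asarray(vals, dtype=np.float64), last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]   # TD residuals
    adv = discount_cumsum(deltas, gamma * lam)          # GAE-lambda advantages
    ret = discount_cumsum(rews, gamma)[:-1]             # rewards-to-go targets for V
    return adv, ret

# Example: a 3-step trajectory cut off by the epoch boundary (bootstrapped by last_val).
adv, ret = gae_and_returns([1.0, 0.0, 1.0], [0.5, 0.4, 0.6], last_val=0.3)
print(adv, ret)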
コード例 #7
0
File: dqn.py  Project: dennischenfeng/spinningup
def dqn(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100,
        replay_size=int(1e5), batch_size=100, gamma=0.99, q_lr=1e-4, start_steps=10000,
        update_after=1000, update_targ_every=50, num_test_episodes=10,
        max_ep_len=1000, epsilon=0.01, epsilon_decay=0.99995, logger_kwargs=dict(), writer_kwargs=dict(), save_freq=1):
    """
    DQN (Deep Q-Networks). Reproduces the original paper by Mnih et al.
    """
    # Instantiate env
    env = env_fn()
    test_env = env_fn()
    # TODO: might have to assert discrete, or otherwise take only first index of shape or so
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Set up actor (pi) & critic (Q), and data buffer
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    q_targ = copy.deepcopy(ac.q)
    for p in q_targ.parameters():
        p.requires_grad = False
    q_optimizer = torch.optim.Adam(ac.q.parameters(), lr=q_lr)

    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Set RNG seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.seed(seed)

    # Set up logging
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    logger.setup_pytorch_saver(ac)
    writer = SummaryWriter(**writer_kwargs)

    start_time = time.time()
    total_steps = epochs * steps_per_epoch
    o = env.reset()
    op = preprocess_obs(o)  # "op" = "observation_preprocessed"
    ep_return = 0  # episode return, counter
    ep_length = 0  # episode length, counter
    for step in range(total_steps):
        # Take an env step, then store data in replay buffer
        if step > start_steps:
            ac.pi.epsilon = max(epsilon, epsilon_decay**step)
            a = ac.act(torch.as_tensor(op, dtype=torch.float32))
        else:
            a = env.action_space.sample()
        o2, r, d, _ = env.step(a)
        o2p = preprocess_obs(o2)
        replay_buffer.store(op, a, r, o2p, d)

        # TODO: does DQN paper say to do 1 GD update with mean of minibatch, or many 1-data-point updates?
        # Sample a random batch from replay buffer and perform one GD step
        q_optimizer.zero_grad()
        batch_data = replay_buffer.sample_batch(batch_size)
        loss_q = compute_loss_q(batch_data, ac, q_targ, gamma)
        loss_q.backward()
        q_optimizer.step()

        # Update target network every so often
        if (step % update_targ_every == 0) and (step >= update_after):
            q_targ = copy.deepcopy(ac.q)
            for p in q_targ.parameters():
                p.requires_grad = False

        # Keep track of episode return and length (for logging purposes)
        ep_return += r
        ep_length += 1

        # If episode done, reset env
        if d:
            o = env.reset()
            op = preprocess_obs(o)
            logger.store(EpRet=ep_return, EpLen=ep_length)
            ep_return = 0
            ep_length = 0
        else:
            op = o2p

        # TODO: confirm there's no need for a separate test setup if the test agent & env match the
        #  training agent & env (e.g. a test setup would be needed if the algo added noise during
        #  training but not at test time).
        # If epoch end, then do a test to see average return thus far
        if step % steps_per_epoch == steps_per_epoch - 1:
            for ep_i in range(num_test_episodes):
                # turn off epsilon exploration:
                old_epsilon = ac.pi.epsilon
                ac.pi.epsilon = 0

                test_ep_return, test_ep_length = run_test_episode(test_env, ac)
                logger.store(TestEpRet=test_ep_return, TestEpLen=test_ep_length)

                # turn it back on
                ac.pi.epsilon = old_epsilon

        # If epoch end, save models and show logged data
        if step % steps_per_epoch == steps_per_epoch - 1:
            epoch_i = int(step // steps_per_epoch)

            writer.add_scalar("EpRet_mean", logger.get_stats("EpRet")[0], epoch_i)  # first item in `get_stats` is mean
            writer.add_scalar("EpRet_std", logger.get_stats("EpRet")[1], epoch_i)  # 2nd item in `get_stats` is std
            writer.add_scalar("TestEpRet_mean", logger.get_stats("TestEpRet")[0], epoch_i)
            writer.add_scalar("TestEpRet_std", logger.get_stats("TestEpRet")[1], epoch_i)
            writer.add_scalar("epsilon", ac.pi.epsilon, epoch_i)

            logger.save_state({'env': env}, None)  # saves both ac and env
            logger.log_tabular("Epoch", epoch_i)
            logger.log_tabular("EpRet", with_min_and_max=True)
            logger.log_tabular("EpLen", average_only=True)
            logger.log_tabular("TestEpRet", with_min_and_max=True)
            logger.log_tabular("TestEpLen", average_only=True)
            logger.log_tabular("TimeFromStart", time.time() - start_time)
            logger.dump_tabular()

    # Save model at end
    logger.save_state({'env': env}, None)
    writer.close()
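
The DQN loop above calls compute_loss_q(batch_data, ac, q_targ, gamma), which is not shown in this excerpt. The sketch below is an assumption of what such a helper typically looks like (one-step Bellman target from the frozen target network); the batch field names and the ac.q call convention are guesses, not the author's exact implementation.

# Hypothetical sketch of a DQN Q-loss helper matching the call site above.
import torch

def compute_loss_q(data, ac, q_targ, gamma):
    # Assumed batch field names from the replay buffer.
    o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']
    # Q(s, a) for the actions actually taken (ac.q assumed to output per-action values).
    q = ac.q(o).gather(1, a.long().unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        # Bellman target: r + gamma * (1 - d) * max_a' Q_targ(s', a')
        q_next = q_targ(o2).max(dim=1).values
        backup = r + gamma * (1 - d) * q_next
    # MSE against the frozen target network.
    return ((q - backup) ** 2).mean()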
コード例 #8
0
File: sac.py  Project: lukashermann/spinningup
def sac(env_fn,
        actor_critic=CNNActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e4),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=512,
        start_steps=10000,
        update_after=100,
        update_every=1,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        device='cuda'):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tb_writer = SummaryWriter(log_dir=logger_kwargs['output_dir'])

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    env = PyTorchWrapper(
        DictToBoxWrapper(
            DictTransposeImage(
                CurriculumWrapper(env,
                                  epochs,
                                  steps_per_epoch,
                                  tb_writer=tb_writer))), device)
    test_env = PyTorchWrapper(DictToBoxWrapper(DictTransposeImage(env_fn())),
                              device)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)
    ac.to(device)
    ac_targ.to(device)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32, device=device),
                      deterministic)

    def test_agent():
        eval_episode_returns = []
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            eval_episode_returns.append(ep_ret)
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
        tb_writer.add_scalar("eval_eprewmean_updates",
                             np.mean(eval_episode_returns), epoch)
        tb_writer.add_scalar("eval_eprewmean_steps",
                             np.mean(eval_episode_returns),
                             (epoch + 1) * steps_per_epoch)
        tb_writer.add_scalar(
            "eval_success_rate",
            float(np.mean(np.array(eval_episode_returns) > 0)),
            (epoch + 1) * steps_per_epoch)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Tensorboard log
            try:
                tb_writer.add_scalar("eprewmean_updates",
                                     logger.get_stats('EpRet')[0], epoch)
                tb_writer.add_scalar("eprewmean_steps",
                                     logger.get_stats('EpRet')[0],
                                     (epoch + 1) * steps_per_epoch)
            except IndexError:
                pass
            tb_writer.flush()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
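
The ReplayBuffer used above (constructed with obs_dim, act_dim, size, device and queried with store / sample_batch) is not shown in this excerpt. Below is a minimal sketch assuming the standard Spinning Up buffer layout with sampled batches moved to the given device; the field names and dtypes are assumptions.

# Minimal replay buffer sketch matching the store/sample_batch interface used above.
import numpy as np
import torch

class ReplayBuffer:
    def __init__(self, obs_dim, act_dim, size, device='cpu'):
        self.obs_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, *obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size
        self.device = device

    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs], obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs], rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        return {k: torch.as_tensor(v, device=self.device) for k, v in batch.items()}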
コード例 #9
0
def ppo(env_fn,
        actor_critic=ImgStateActorCriticDictBox,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4096,
        epochs=2441,
        gamma=0.99,
        clip_ratio=0.1,
        pi_lr=2.5e-4,
        vf_lr=2.5e-3,
        train_pi_iters=16,
        train_v_iters=16,
        lam=0.95,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=300,
        eval_interval=20,
        num_eval_episodes=32,
        device='cuda',
        linear_lr_decay=True,
        linear_clip_decay=True):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tb_writer = SummaryWriter(log_dir=logger_kwargs['output_dir'])

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    initial_clip_ratio = clip_ratio
    local_steps_per_epoch = int(steps_per_epoch / num_procs())

    # Instantiate environment
    env = env_fn()
    env = PyTorchWrapper(
        DictToBoxWrapper(
            DictTransposeImage(
                CurriculumWrapper(env,
                                  epochs,
                                  local_steps_per_epoch,
                                  tb_writer=tb_writer))), device)
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac.to(device)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(
        core.count_vars(module) for module in [ac.cnn, ac.pi_, ac.v_])
    logger.log('\nNumber of parameters: \t cnn: %d, \t pi: %d, \t v: %d\n' %
               var_counts)

    # Set up experience buffer

    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam,
                    device)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        # Optionally decay the clip range linearly over training; otherwise use
        # the fixed clip_ratio. (Assigning to clip_ratio itself here would
        # shadow the enclosing variable when linear_clip_decay is False.)
        if linear_clip_decay:
            eps = initial_clip_ratio * (1 - epoch / epochs)
        else:
            eps = clip_ratio
        clip_adv = torch.clamp(ratio, 1 - eps, 1 + eps) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
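        # (The elementwise min above keeps the smaller, more pessimistic of the
        # unclipped and clipped surrogates, i.e. a lower bound on the
        # unclipped objective.)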

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + eps) | ratio.lt(1 - eps)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(list(ac.cnn.parameters()) + list(ac.pi_.parameters()),
                        lr=pi_lr)
    vf_optimizer = Adam(list(ac.cnn.parameters()) + list(ac.v_.parameters()),
                        lr=vf_lr)
    lr_decay = lambda e: 1 - e / epochs
    pi_lr_scheduler = LambdaLR(pi_optimizer, lr_decay)
    vf_lr_scheduler = LambdaLR(vf_optimizer, lr_decay)
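    # The LambdaLR schedulers above scale the base learning rates by
    # lr_decay(n) = 1 - n / epochs after each .step(), i.e. a linear decay to
    # zero over training; they are only stepped when linear_lr_decay is set.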

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.cnn)  # average grads across MPI processes
            mpi_avg_grads(ac.pi_)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.cnn)  # average grads across MPI processes
            mpi_avg_grads(ac.v_)  # average grads across MPI processes
            vf_optimizer.step()
        if linear_lr_decay:
            pi_lr_scheduler.step()
            vf_lr_scheduler.step()
        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    def eval_policy():
        eval_env = PyTorchWrapper(
            DictToBoxWrapper(DictTransposeImage(env_fn())), device)
        eval_episode_returns = []
        ob = eval_env.reset()
        eval_ep_ret = 0
        while len(eval_episode_returns) < num_eval_episodes:
            action = ac.act(ob, deterministic=True)
            ob, rew, done, _ = eval_env.step(action)
            eval_ep_ret += rew.cpu().numpy()
            if done:
                eval_episode_returns.append(eval_ep_ret)
                ob = eval_env.reset()
                eval_ep_ret = 0
        eval_env.close()
        tb_writer.add_scalar("eval_eprewmean_updates",
                             np.mean(eval_episode_returns), epoch)
        tb_writer.add_scalar("eval_eprewmean_steps",
                             np.mean(eval_episode_returns),
                             (epoch + 1) * steps_per_epoch)
        # np.float was removed in recent NumPy; a plain Python float suffices.
        tb_writer.add_scalar(
            "eval_success_rate",
            float(np.mean(np.array(eval_episode_returns) > 0)),
            (epoch + 1) * steps_per_epoch)

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    actions = []
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(
                torch.as_tensor(o, dtype=torch.float32, device=device))
            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(
                        torch.as_tensor(o, dtype=torch.float32, device=device))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        if epoch % eval_interval == 0 or (epoch == epochs - 1):
            if epoch == epochs - 1:
                num_eval_episodes = 100
            eval_policy()

        # Tensorboard log
        try:
            tb_writer.add_scalar("eprewmean_updates",
                                 logger.get_stats('EpRet')[0], epoch)
            tb_writer.add_scalar("eprewmean_steps",
                                 logger.get_stats('EpRet')[0],
                                 (epoch + 1) * steps_per_epoch)
        except IndexError:
            pass
        tb_writer.flush()
        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
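
# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original example): the docstring above
# refers to GAE-Lambda advantages, but the PPOBuffer that computes them is
# defined elsewhere. Under the usual definitions, the advantages for a single
# finished trajectory could be computed as below; all names here are
# illustrative, not the buffer's actual API.
# ---------------------------------------------------------------------------
import numpy as np


def gae_lambda_advantages(rewards, values, last_value, gamma=0.99, lam=0.97):
    """Illustrative GAE-Lambda computation for one trajectory.

    rewards:    (T,) rewards r_0 .. r_{T-1}
    values:     (T,) value estimates V(s_0) .. V(s_{T-1})
    last_value: bootstrap value V(s_T), or 0 if the episode terminated
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.append(np.asarray(values, dtype=np.float64), last_value)
    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + gamma * values[1:] - values[:-1]
    # A_t = sum_{l >= 0} (gamma * lam)^l * delta_{t+l}, accumulated backwards
    adv = np.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv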
Code Example #10
def sac(env_fn,
        env_name,
        test_env_fns=[],
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        load_dir=None,
        num_procs=1,
        clean_every=200):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of
            observations as inputs, and ``q1`` and ``q2`` should accept a batch
            of observations and a batch of actions as inputs. When called,
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) See the comment
            sketch just after this docstring for how alpha enters the Q-targets
            and the policy loss.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    from spinup.examples.pytorch.eval_sac import load_pytorch_policy

    print(f"SAC proc_id {proc_id()}")
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    if proc_id() == 0:
        writer = SummaryWriter(log_dir=os.path.join(
            logger.output_dir, str(datetime.datetime.now())),
                               comment=logger_kwargs["exp_name"])

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = SubprocVecEnv([partial(env_fn, rank=i) for i in range(num_procs)],
                        "spawn")
    test_env = SubprocVecEnv(test_env_fns, "spawn")
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    if load_dir is not None:
        _, ac = load_pytorch_policy(load_dir, itr="", deterministic=False)
    else:
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing TD losses for the auxiliary feature predictions
    def compute_loss_feats(data):
        o, a, r, o2, d, feats = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done'], data["feats"]

        feats = torch.stack(list(feats.values())).T  # (nbatch, nfeats)
        feats1 = ac.q1.predict_feats(o, a)
        feats2 = ac.q2.predict_feats(o, a)

        feats_keys = replay_buffer.feats_keys

        # Bellman backup for feature functions
        with torch.no_grad():
            a2, _ = ac.pi(o2)

            # Target feature values
            feats1_targ = ac_targ.q1.predict_feats(o2, a2)
            feats2_targ = ac_targ.q2.predict_feats(o2, a2)
            feats_targ = torch.min(feats1_targ, feats2_targ)
            backup = feats + gamma * (1 - d[:, None]) * feats_targ

        # MSE loss against Bellman backup
        loss_feats1 = ((feats1 - backup)**2).mean(axis=0)
        loss_feats2 = ((feats2 - backup)**2).mean(axis=0)
        loss_feats = loss_feats1 + loss_feats2

        # Useful info for logging
        feats_info = dict(Feats1Vals=feats1.detach().numpy(),
                          Feats2Vals=feats2.detach().numpy())

        return loss_feats, feats_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, feats_keys):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        loss_feats, feats_info = compute_loss_feats(data)
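        # Note: loss_feats is computed here for logging only; no backward()
        # pass is taken on it in this update step.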
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Feature loss
        keys = [f"LossFeats_{key}" for key in feats_keys]
        for key, val in zip(keys, loss_feats):
            logger.store(**{key: val.item()})

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so they can be optimized at the next update step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to
                # update target params, as opposed to "mul" and "add", which
                # would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(feats_keys):
        num_envs = len(test_env_fns)
        env_ep_rets = np.zeros(num_envs)
        for j in range(num_test_episodes):
            o, d = test_env.reset(), np.zeros(num_envs, dtype=bool)
            ep_len = np.zeros(num_envs)
            while not (np.all(d) or np.all(ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, info = test_env.step(get_action(o, True))
                env_ep_rets += r
                ep_len += 1
            # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
        for ti in range(num_envs):
            logger.store(
                **{f"TestEpRet_{ti}": env_ep_rets[ti] / num_test_episodes})

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(num_procs)

    # Main loop: collect experience in env and update/log each epoch
    epoch = 0
    update_times, clean_times = 0, 0
    t = 0
    while t <= total_steps:
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = np.stack([env.action_space.sample() for _ in range(num_procs)])

        # Step the env
        o2, r, d, info = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        if np.all(ep_len == max_ep_len):
            d.fill(False)

        # Store experience to replay buffer
        replay_buffer.store_vec(o, a, r, o2, d,
                                [inf["features"] for inf in info])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling, assumes all subenvs end at the same time
        if np.all(d) or np.all(ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            if clean_every > 0 and epoch // clean_every >= clean_times:
                env.close()
                test_env.close()
                env = SubprocVecEnv(
                    [partial(env_fn, rank=i) for i in range(num_procs)],
                    "spawn")
                test_env = SubprocVecEnv(test_env_fns, "spawn")
                clean_times += 1

            o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(
                num_procs)

        # Update handling
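        # (t advances by num_procs per iteration, so this gate fires roughly
        # once every update_every total environment steps and then performs
        # update_every gradient steps, keeping the env-step to gradient-step
        # ratio near 1, as noted in the docstring.)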
        if t >= update_after and t / update_every > update_times:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, feats_keys=replay_buffer.feats_keys)
            update_times += 1

        # End of epoch handling
        if t // steps_per_epoch > epoch:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env_name': env_name}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(replay_buffer.feats_keys)

            # Update tensorboard
            if proc_id() == 0:
                log_perf_board = ['EpRet', 'EpLen', 'Q1Vals', 'Q2Vals'] + [
                    f"TestEpRet_{ti}" for ti in range(len(test_env_fns))
                ]
                log_loss_board = ['LogPi', 'LossPi', 'LossQ'] + [
                    key
                    for key in logger.epoch_dict.keys() if "LossFeats" in key
                ]
                log_board = {
                    'Performance': log_perf_board,
                    'Loss': log_loss_board
                }
                for key, value in log_board.items():
                    for val in value:
                        mean, std = logger.get_stats(val)
                        if key == 'Performance':
                            writer.add_scalar(key + '/Average' + val, mean,
                                              epoch)
                            writer.add_scalar(key + '/Std' + val, std, epoch)
                        else:
                            writer.add_scalar(key + '/' + val, mean, epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            if proc_id() == 0:
                writer.flush()

                import psutil  # local import; cached after the first epoch
                # overall CPU utilization (single float) and memory usage (%)
                cpu_percent = psutil.cpu_percent()
                mem_percent = psutil.virtual_memory().percent
                print(f"Used cpu avg {cpu_percent}% memory {mem_percent}%")
                cpu_separate = psutil.cpu_percent(percpu=True)
                for ci, cval in enumerate(cpu_separate):
                    print(f"\t cpu {ci}: {cval}%")
        t += num_procs

    if proc_id() == 0:
        writer.close()
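
# ---------------------------------------------------------------------------
# Editor's usage sketch (illustrative only, not from the original source):
# one way this trainer might be invoked, assuming a user-supplied `make_env`
# constructor that accepts the `rank` keyword used by SubprocVecEnv above.
# Note the wrapped env is also expected to return an info dict containing a
# "features" entry, since replay_buffer.store_vec reads info["features"].
#
#     from functools import partial
#
#     def make_env(rank=0):
#         import gym
#         return gym.make("Pendulum-v1")   # any continuous-control Gym env
#
#     sac(env_fn=make_env,
#         env_name="Pendulum-v1",
#         test_env_fns=[partial(make_env, rank=i) for i in range(2)],
#         steps_per_epoch=4000,
#         epochs=50,
#         num_procs=2,
#         logger_kwargs=dict(output_dir="./out", exp_name="sac_demo"))
# ---------------------------------------------------------------------------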