Code example #1
File: ude_ddpg.py  Project: LinghengMeng/spinningup
def ude_ddpg(env_fn,
             actor_critic=core.mlp_actor_critic,
             ac_kwargs=dict(),
             seed=0,
             steps_per_epoch=5000,
             epochs=100,
             replay_size=int(1e6),
             gamma=0.99,
             polyak=0.995,
             pi_lr=1e-3,
             q_lr=1e-3,
             batch_size=100,
             start_steps=10000,
             act_noise=0.1,
             policy_delay=2,
             max_ep_len=1000,
             n_post_action=10,
             sample_action_with_dropout=True,
             dropout_rate=0.1,
             action_choose_method='random_sample',
             uncertainty_noise_type='std_noise',
             a_var_clip_max=1,
             a_var_clip_min=0.1,
             a_std_clip_max=1,
             a_std_clip_min=0.1,
             logger_kwargs=dict(),
             save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)
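
        policy_delay (int): Policy will only be updated once every
            policy_delay times for each update of the Q-network.

        n_post_action (int): Number of dropout-based posterior action samples
            drawn per state when estimating action uncertainty.

        sample_action_with_dropout (bool): If True, explore with the posterior
            action samples instead of plain Gaussian noise.

        dropout_rate (float): Dropout rate used by the posterior samplers.

        action_choose_method (str): How the posterior samples are turned into
            an action: 'random_sample', 'gaussian_sample', 'mean_of_samples',
            'median_of_sample', 'mean_and_variance_based_noise',
            'median_and_variance_based_noise', or
            'prediction_and_variance_based_noise'.

        uncertainty_noise_type (str): Uncertainty estimate used to scale the
            exploration noise: 'var_noise' or 'std_noise'.

        a_var_clip_max, a_var_clip_min (float): Clipping range for the
            per-dimension action variance.

        a_std_clip_max, a_std_clip_min (float): Clipping range for the
            per-dimension action standard deviation.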

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, pi_post_samplers, q, q_pi = actor_critic(x_ph,
                                                     a_ph,
                                                     **ac_kwargs,
                                                     create_post_samplers=True,
                                                     n_post=n_post_action,
                                                     dropout_rate=dropout_rate)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, pi_targ_post_samplers, _, q_pi_targ = actor_critic(
            x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })

    def get_action_test(o):
        """Get action in test phase."""
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        return np.clip(a, -act_limit, act_limit)

    def get_action_train(o):
        """Get action in training phase"""
        a_var_uncertainty = 0
        a_var_uncertainty_clipped = 0
        a_std_uncertainty = 0
        a_std_uncertainty_clipped = 0
        if sample_action_with_dropout:
            # Collect post samples into an ndarray of shape (n_post, act_dim)
            pi_weights = sess.run(
                get_vars('main/pi'))  # Get current policy weights
            a_post = np.array(
                ray.get([
                    p_s.sample_action.remote(pi_weights, o)
                    for p_s in pi_post_samplers
                ]))

            # TODO: var and std must be scaled or clipped. Otherwise, a huge
            #  variance will always push the action outside act_limit and get
            #  clipped to -1 or 1. We also need a lower bound to enforce a
            #  minimum amount of exploration.
            a_mean = np.mean(a_post, axis=0)
            a_median = np.median(a_post, axis=0)

            a_var = np.var(a_post, axis=0)
            a_var_clipped = np.clip(a_var, a_var_clip_min, a_var_clip_max)
            a_var_noise = a_var_clipped * np.random.randn(act_dim)

            a_std = np.std(a_post, axis=0)
            a_std_clipped = np.clip(a_std, a_std_clip_min, a_std_clip_max)
            a_std_noise = a_std_clipped * np.random.randn(act_dim)
            # TODO: define uncertainty as a value that is not affected by the action dimension.
            a_var_uncertainty = np.mean(a_var)  # np.sum(a_var)
            a_var_uncertainty_clipped = np.mean(
                a_var_clipped)  # np.sum(a_var_clipped)
            a_std_uncertainty = np.mean(a_std)  # np.sum(a_std)
            a_std_uncertainty_clipped = np.mean(
                a_std_clipped)  # np.sum(a_std_clipped)

            # TODO: clip noise within a range. Maybe not necessary.
            if uncertainty_noise_type == 'var_noise':
                noise = a_var_noise
            elif uncertainty_noise_type == 'std_noise':
                noise = a_std_noise
            else:
                raise ValueError('Please choose a proper noise_type.')
            a = np.zeros((act_dim, ))
            if action_choose_method == 'random_sample':
                # Method 1: randomly sample one from post sampled actions
                a = a_post[np.random.choice(n_post_action)]
            elif action_choose_method == 'gaussian_sample':
                # Method 2: estimate mean and std, then sample from a Gaussian distribution
                for a_i in range(act_dim):
                    a[a_i] = np.random.normal(a_mean[a_i], a_std_clipped[a_i],
                                              1)
            elif action_choose_method == 'mean_of_samples':
                a = a_mean
            elif action_choose_method == 'median_of_sample':
                a = a_median
            elif action_choose_method == 'mean_and_variance_based_noise':
                a = a_mean + noise
            elif action_choose_method == 'median_and_variance_based_noise':
                a = a_median + noise
            elif action_choose_method == 'prediction_and_variance_based_noise':
                a_prediction = sess.run(pi, feed_dict={x_ph: o.reshape(1,
                                                                       -1)})[0]
                a = a_prediction + noise
            else:
                raise ValueError('Please choose a proper action_choose_method.')
        else:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
            a += act_noise * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit), \
               a_var_uncertainty, a_var_uncertainty_clipped, a_std_uncertainty, a_std_uncertainty_clipped

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action_test(o))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len, \
    ep_a_var_uncertainty,ep_a_var_uncertainty_clipped, \
    ep_a_std_uncertainty, ep_a_std_uncertainty_clipped = env.reset(), 0, False, 0, 0, 0, 0, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a, a_var_uncertainty, a_var_uncertainty_clipped, \
            a_std_uncertainty, a_std_uncertainty_clipped = get_action_train(o)
        else:
            a = env.action_space.sample()
            # TODO:
            a_var_uncertainty = 0
            a_var_uncertainty_clipped = 0
            a_std_uncertainty = 0
            a_std_uncertainty_clipped = 0

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        ep_a_var_uncertainty += a_var_uncertainty
        ep_a_var_uncertainty_clipped += a_var_uncertainty_clipped
        ep_a_std_uncertainty += a_std_uncertainty
        ep_a_std_uncertainty_clipped += a_std_uncertainty_clipped

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret,
                         EpLen=ep_len,
                         EpVarUncertainty=ep_a_var_uncertainty,
                         EpVarUncertaintyClipped=ep_a_var_uncertainty_clipped,
                         EpStdUncertainty=ep_a_std_uncertainty,
                         EpStdUncertaintyClipped=ep_a_std_uncertainty_clipped)
            o, r, d, ep_ret, ep_len, \
            ep_a_var_uncertainty, ep_a_var_uncertainty_clipped, \
            ep_a_std_uncertainty, ep_a_std_uncertainty_clipped = env.reset(), 0, False, 0, 0, 0, 0, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('EpVarUncertainty', with_min_and_max=True)
            logger.log_tabular('EpVarUncertaintyClipped',
                               with_min_and_max=True)
            logger.log_tabular('EpStdUncertainty', with_min_and_max=True)
            logger.log_tabular('EpStdUncertaintyClipped',
                               with_min_and_max=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
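
A minimal launch sketch for ude_ddpg (not part of the original file). The environment id, network size, and output directory are assumptions; ray.init() is included because the posterior action samplers (pi_post_samplers) built by the custom actor_critic are used as Ray actors (sample_action.remote).

if __name__ == '__main__':
    import gym
    import ray

    ray.init()  # assumed requirement: pi_post_samplers are Ray actors
    ude_ddpg(lambda: gym.make('HalfCheetah-v2'),       # hypothetical environment choice
             ac_kwargs=dict(hidden_sizes=[400, 300]),  # assumed kwarg of the custom actor_critic
             seed=0,
             epochs=50,
             sample_action_with_dropout=True,
             action_choose_method='random_sample',
             logger_kwargs=dict(output_dir='./ude_ddpg_out', exp_name='ude_ddpg'))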
Code example #2
def oac(env_fn, actor_critic=mlp_actor_critic,
                logger_kwargs=dict(),
                network_params=dict(),
                rl_params=dict()):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # control params
    seed                = rl_params['seed']
    epochs              = rl_params['epochs']
    steps_per_epoch     = rl_params['steps_per_epoch']
    replay_size         = rl_params['replay_size']
    batch_size          = rl_params['batch_size']
    start_steps         = rl_params['start_steps']
    max_ep_len          = rl_params['max_ep_len']
    save_freq           = rl_params['save_freq']
    render              = rl_params['render']

    # rl params
    gamma               = rl_params['gamma']
    polyak              = rl_params['polyak']
    lr                  = rl_params['lr']
    state_hist_n        = rl_params['state_hist_n']
    grad_clip_val       = rl_params['grad_clip_val']

    # entropy params
    alpha                = rl_params['alpha']
    target_entropy_start = rl_params['target_entropy_start']
    target_entropy_stop  = rl_params['target_entropy_stop']
    target_entropy_steps = rl_params['target_entropy_steps']

    # optimistic exploration params
    use_opt             = rl_params['use_opt']
    beta_UB             = rl_params['beta_UB']
    beta_LB             = rl_params['beta_LB']
    delta               = rl_params['delta']
    opt_lr              = rl_params['opt_lr']
    max_opt_steps       = rl_params['max_opt_steps']

    train_env, test_env = env_fn(), env_fn()
    obs_space = train_env.observation_space
    act_space = train_env.action_space

    try:
        obs_dim = obs_space.n
        observation_type = 'Discrete'
    except AttributeError as e:
        obs_dim = obs_space.shape[0]
        observation_type = 'Box'

    act_dim = act_space.n

    # set the seed
    tf.set_random_seed(seed)
    np.random.seed(seed)
    train_env.seed(seed)
    train_env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim*state_hist_n, act_dim=act_dim, size=replay_size)

    # init a state buffer for storing last m states
    train_state_buffer = StateBuffer(m=state_hist_n)
    test_state_buffer  = StateBuffer(m=state_hist_n)

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim*state_hist_n, act_dim, obs_dim*state_hist_n, None, None)

    # alpha and entropy setup
    max_target_entropy = tf.log(tf.cast(act_dim, tf.float32))
    target_entropy_prop_ph =  tf.placeholder(dtype=tf.float32, shape=())
    target_entropy = max_target_entropy * target_entropy_prop_ph

    log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)

    if alpha == 'auto': # auto tune alpha
        alpha = tf.exp(log_alpha)
    else: # fixed alpha
        alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha)

    # Main outputs from computation graph
    # with tf.variable_scope('main'):
    #     mu, pi, action_probs, log_action_probs, action_logits, q1_logits, q2_logits, q1_a, q2_a = actor_critic(x_ph, a_ph, **network_params)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, action_probs, log_action_probs, action_logits, q1_logits, q2_logits  = actor_critic(x_ph, a_ph, **network_params)

    with tf.variable_scope('main', reuse=True):
        _, _, action_probs_next, log_action_probs_next, _, _, _  =  actor_critic(x2_ph, a_ph, **network_params)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, q1_logits_targ, q2_logits_targ = actor_critic(x2_ph, a_ph, **network_params)

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in ['log_alpha',
                                                       'main/pi',
                                                       'main/q1',
                                                       'main/q2',
                                                       'main'])
    print("""\nNumber of parameters:
             alpha: %d,
             pi: %d,
             q1: %d,
             q2: %d,
             total: %d\n"""%var_counts)
             
    if use_opt:
        # Optimistic Exploration
        mu_Q    = (q1_logits + q2_logits) / 2.0
        sigma_Q = tf.math.abs(q1_logits - q2_logits) / 2.0

        Q_UB = mu_Q + beta_UB * sigma_Q
        Q_LB = mu_Q + beta_LB * sigma_Q

        Q_UB_sm = tf.nn.softmax(Q_UB, axis=-1) # needed to make EV and penalty proportional for optimisation

        R = tf.get_variable('R', dtype=tf.float32, shape=[1,act_dim], initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
        assign_R = R.assign(action_logits) # initialises P as the same "pessimistic" action distribution
        P = tf.nn.softmax(R, axis=-1)

        expected_value = tf.reduce_sum( tf.multiply(P, Q_UB_sm) )
        KL_P_PT = tf.reduce_sum( tf.multiply(P, tf.log( tf.divide(P, action_probs) ) ) )
        penalty = KL_P_PT - delta
        relu_penalty = tf.nn.relu(penalty)
        penalised_opt_function = - expected_value + relu_penalty

        optpi_optimizer = tf.train.AdamOptimizer(learning_rate=opt_lr)
        train_optpi_op  = optpi_optimizer.minimize(penalised_opt_function, var_list=get_vars('R'))

        optimistic_policy_dist = tf.distributions.Categorical(probs=P)
        optimistic_pi = optimistic_policy_dist.sample()
    else:
        optimistic_pi = pi # use standard SAC policy
        Q_LB = tf.minimum(q1_logits, q2_logits)

    # Min Double-Q:
    min_q_logits_targ  = tf.minimum(q1_logits_targ, q2_logits_targ)

    # Targets for Q regression
    q_backup = r_ph + gamma*(1-d_ph)*tf.stop_gradient( tf.reduce_sum(action_probs_next * (min_q_logits_targ - alpha * log_action_probs_next), axis=-1))

    # critic losses
    q1_a  = tf.reduce_sum(tf.multiply(q1_logits, a_ph), axis=1)
    q2_a  = tf.reduce_sum(tf.multiply(q2_logits, a_ph), axis=1)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
    value_loss = q1_loss + q2_loss

    # policy loss
    pi_backup = tf.reduce_sum(action_probs * ( alpha * log_action_probs - Q_LB ), axis=-1)
    pi_loss = tf.reduce_mean(pi_backup)

    # alpha loss for temperature parameter
    pi_entropy = -tf.reduce_sum(action_probs * log_action_probs, axis=-1)
    alpha_backup = tf.stop_gradient(target_entropy - pi_entropy)
    alpha_loss   = -tf.reduce_mean(log_alpha * alpha_backup)

    # Policy train op
    # (has to be separate from value train op, because q1_logits appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    if grad_clip_val is not None:
        gvs = pi_optimizer.compute_gradients(pi_loss,  var_list=get_vars('main/pi'))
        capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs]
        train_pi_op = pi_optimizer.apply_gradients(capped_gvs)
    else:
        train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_pi_op]):
        if grad_clip_val is not None:
            gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q'))
            capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs]
            train_value_op = value_optimizer.apply_gradients(capped_gvs)
        else:
            train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q'))

    alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_value_op]):
        train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha'))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a,
                pi_entropy, target_entropy,
                alpha_loss, alpha,
                train_pi_op, train_value_op, train_alpha_op, target_update]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                                outputs={'mu': mu, 'pi': pi, 'q1_a': q1_a, 'q2_a': q2_a})


    def get_action(state, deterministic=False):

        # # record data for printing
        # _ =  sess.run(assign_R, feed_dict={x_ph: [state]})
        # ins = sess.run([action_probs, Q_UB, P, KL_P_PT], feed_dict={x_ph: [state]})

        if deterministic:
            act_op = mu
        else:
            if use_opt:
                # run a few optimisation steps to set optimistic policy
                _ =  sess.run(assign_R, feed_dict={x_ph: [state]})
                for i in range(max_opt_steps):
                    _ = sess.run([train_optpi_op], feed_dict={x_ph: [state]})

            act_op = optimistic_pi

        # # print difference between pessimistic and optimistic policy probabilities
        # outs = sess.run([P, KL_P_PT], feed_dict={x_ph: [state]})
        #
        # print('ap:     ', ins[0])
        # print('Q:      ', ins[1])
        # print('P_in:   ', ins[2])
        # print('P_out:  ', outs[0])
        # print('KL_in:  ', ins[3])
        # print('KL_out: ', outs[1])
        # print('')

        return sess.run(act_op, feed_dict={x_ph: [state]})[0]

    def reset(env, state_buffer):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        o = process_observation(o, obs_dim, observation_type)
        r = process_reward(r)
        state = state_buffer.init_state(init_obs=o)
        return o, r, d, ep_ret, ep_len, state

    def test_agent(n=10, render=True):
        for j in range(n):
            o, r, d, ep_ret, ep_len, test_state = reset(test_env, test_state_buffer)

            if render: test_env.render()

            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(test_state, True))
                o = process_observation(o, obs_dim, observation_type)
                r = process_reward(r)
                test_state = test_state_buffer.append_state(o)
                ep_ret += r
                ep_len += 1

                if render: test_env.render()

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
        if render: test_env.close()

    start_time = time.time()
    o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer)
    total_steps = steps_per_epoch * epochs

    target_entropy_prop = linear_anneal(current_step=0, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps)

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(state)
        else:
            a = train_env.action_space.sample()

        # Step the env
        o2, r, d, _ = train_env.step(a)
        o2 = process_observation(o2, obs_dim, observation_type)
        a = process_action(a, act_dim)
        r = process_reward(r)
        next_state = train_state_buffer.append_state(o2)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(state, a, r, next_state, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        state = next_state

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph:  batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph:  batch['acts'],
                             r_ph:  batch['rews'],
                             d_ph:  batch['done'],
                             target_entropy_prop_ph: target_entropy_prop
                            }

                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],    LossQ2=outs[2],
                             Q1Vals=outs[3],    Q2Vals=outs[4],
                             PiEntropy=outs[5], TargEntropy=outs[6],
                             LossAlpha=outs[7], Alpha=outs[8])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer)


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # update target entropy every epoch
            target_entropy_prop = linear_anneal(current_step=t, start=target_entropy_start, stop=target_entropy_stop, steps=target_entropy_steps)

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': train_env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(n=10,render=render)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('PiEntropy', average_only=True)
            logger.log_tabular('TargEntropy', average_only=True)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()

    plot_progress(os.path.join(logger_kwargs['output_dir'],'progress.txt'), show_plot=False)
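
oac() reads every hyperparameter from rl_params, so the dictionary must contain all of the keys unpacked at the top of the function, and logger_kwargs must include output_dir for the final plot_progress call. A minimal launch sketch follows; the concrete values, the CartPole-v1 choice (the code assumes a discrete action space via act_space.n), and the directory name are assumptions, not values from the original project.

if __name__ == '__main__':
    import gym

    rl_params = dict(
        seed=0, epochs=100, steps_per_epoch=4000, replay_size=int(1e6),
        batch_size=64, start_steps=1000, max_ep_len=500, save_freq=1,
        render=False,
        gamma=0.99, polyak=0.995, lr=3e-4, state_hist_n=1, grad_clip_val=None,
        alpha='auto', target_entropy_start=1.0, target_entropy_stop=0.5,
        target_entropy_steps=int(1e5),
        use_opt=True, beta_UB=4.66, beta_LB=-3.65, delta=23.53,
        opt_lr=1e-2, max_opt_steps=10)    # all values are assumed defaults

    oac(lambda: gym.make('CartPole-v1'),  # discrete-action environment (assumption)
        logger_kwargs=dict(output_dir='./oac_out', exp_name='oac_discrete'),
        network_params=dict(),            # forwarded to the actor_critic builder
        rl_params=rl_params)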
Code example #3
def d3pg(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         without_start_steps=True,
         batch_size=100,
         start_steps=10000,
         without_delay_train=False,
         reward_scale=1,
         uncertainty_driven_exploration=True,
         n_post=100,
         concentration_factor=0.5,
         uncertainty_policy_delay=5000,
         act_noise=0.1,
         target_noise=0.2,
         noise_clip=0.5,
         policy_delay=2,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to D3PG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.
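
        without_start_steps (bool): If True, start_steps is reduced to
            batch_size, so learning starts almost immediately.

        without_delay_train (bool): If True, updates were meant to happen at
            every environment step rather than at the end of each trajectory
            (that per-step branch is commented out in this version).

        reward_scale (float): Factor applied to rewards in the Bellman backup.

        uncertainty_driven_exploration (bool): If True, explore with
            dropout-based action uncertainty instead of plain Gaussian noise
            (the corresponding sampler is commented out in this version).

        n_post (int): Number of posterior action samples used to estimate the
            action covariance for uncertainty-driven exploration.

        concentration_factor (float): Scale applied to the estimated action
            covariance / standard deviation before sampling an exploratory
            action.

        uncertainty_policy_delay (int): How often (in environment steps) the
            dropout copy of the actor would be refreshed from the main actor.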

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    # TODO: Test no start steps
    if without_start_steps:
        start_steps = batch_size

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    hidden_sizes = list(ac_kwargs['hidden_sizes'])
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear
    LOG_VAR_MIN = -20
    LOG_VAR_MAX = 20  #20
    # Main actor-critic
    with tf.variable_scope('main'):
        actor = MLP(hidden_sizes + [act_dim],
                    hidden_activation=actor_hidden_activation,
                    output_activation=tf.keras.activations.tanh)

        # critic = MLP(hidden_sizes + [1],
        #              hidden_activation=critic_hidden_activation,
        #              output_activation=critic_output_activation)
        dueling_critic = DuelingMLP()

        pi = act_limit * actor(x_ph)
        v_out, a_out, q_out = dueling_critic(None,
                                             state=x_ph,
                                             action=a_ph,
                                             optimal_action=pi)
        v_out, a_out, q_out = tf.squeeze(v_out, axis=1), tf.squeeze(
            a_out, axis=1), tf.squeeze(q_out, axis=1)
        _, a_pi_out, q_pi_out = dueling_critic(None,
                                               state=x_ph,
                                               action=pi,
                                               optimal_action=pi)
        a_pi_out, q_pi_out = tf.squeeze(a_pi_out, axis=1), tf.squeeze(q_pi_out,
                                                                      axis=1)

    # Target actor-critic
    with tf.variable_scope('target'):
        actor_targ = MLP(hidden_sizes + [act_dim],
                         hidden_activation=actor_hidden_activation,
                         output_activation=actor_output_activation)

        # critic_targ = MLP(hidden_sizes + [1],
        #                   hidden_activation=critic_hidden_activation,
        #                   output_activation=critic_output_activation)
        dueling_critic_targ = DuelingMLP()

        pi_targ = act_limit * actor_targ(x2_ph)
        # Use the target critic and the target policy's action for the backup.
        _, a_pi_out_targ, q_pi_out_targ = dueling_critic_targ(None,
                                                              state=x2_ph,
                                                              action=pi_targ,
                                                              optimal_action=pi_targ)
        q_pi_out_targ = tf.squeeze(q_pi_out_targ, axis=1)

    # Create LazyBernoulliDropoutMLP:
    #       which copies weights from MLP via
    #           sess.run(lazy_ber_drop_mlp_update)
    #       and then draws posterior-sample predictions with dropout masks.
    # with tf.variable_scope('LazyBernoulliDropoutUncertaintySample'):
    #     # define placeholder for parallel sampling
    #     #   batch x n_post x dim
    #     lazy_bernoulli_dropout_actor = BeroulliDropoutMLP(hidden_sizes + [act_dim], weight_regularizer=1e-6, dropout_rate=0.05,
    #                                                       hidden_activation=actor_hidden_activation,
    #                                                       output_activation=actor_output_activation)
    #     lazy_ber_drop_pi = act_limit*lazy_bernoulli_dropout_actor(x_ph, training=True, duplicate_input=False)  # Set training=True to sample with dropout masks
    #     lazy_ber_drop_actor_update = tf.group([tf.assign(v_lazy_ber_drop_mlp, v_mlp)
    #                                            for v_mlp, v_lazy_ber_drop_mlp in
    #                                            zip(actor.variables, lazy_bernoulli_dropout_actor.variables)])

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 logger_fname='experiences_log.txt',
                                 **logger_kwargs)

    # Count variables
    print('\nNumber of parameters: \t pi: {:d}, \t q: {:d}, \t total: {:d}\n'.
          format(count_vars(actor.variables),
                 count_vars(dueling_critic.variables),
                 count_vars(get_vars('main'))))

    # Bellman backup for the Q function
    backup = tf.stop_gradient(r_ph * reward_scale + gamma *
                              (1 - d_ph) * q_pi_out_targ)

    #  losses
    pi_loss = -tf.reduce_mean(q_pi_out)
    # pi_loss = -tf.reduce_mean(a_pi_out)
    q_loss = tf.reduce_mean((q_out - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables)
    train_q_op = q_optimizer.minimize(q_loss,
                                      var_list=dueling_critic.variables)

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # sess.run(lazy_ber_drop_actor_update)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2})

    # def get_uncertainty_driven_explore_action(o):
    #     o_post_samples = np.matlib.repmat(o.reshape(1,-1), n_post, 1)  # repmat x for post sampling
    #
    #     # 1. Generate action Prediction
    #     a_pred = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
    #
    #     # 2. Generate post sampled actions
    #     a_post = sess.run(lazy_ber_drop_pi, feed_dict={x_ph: o_post_samples})
    #
    #     a = np.zeros((act_dim,))
    #     if act_dim > 1:
    #         a_cov = np.cov(a_post, rowvar=False)
    #         a_cov_shaped = concentration_factor * a_cov
    #         a = np.random.multivariate_normal(a_pred, a_cov_shaped, 1)[0]
    #         unc_a = a_cov
    #     else:
    #         a_std = np.std(a_post, axis=0)
    #         a_std_shaped = concentration_factor * a_std
    #         a = np.random.normal(a_pred, a_std_shaped, 1)[0]
    #         unc_a = a_std
    #
    #     a = np.clip(a, -act_limit, act_limit)
    #     # TODO: logdet as intrinsic reward
    #     return a, unc_a

    def get_gaussian_noise_explore_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def get_action_test(o):
        """Get deterministic action without exploration."""
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        # print('test a={}'.format(a))
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action_test(o))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len, ep_unc = env.reset(), 0, False, 0, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        # The dropout-based uncertainty sampler is commented out above, so
        # unc_a defaults to zero and exploration falls back to Gaussian noise.
        unc_a = np.zeros((act_dim, act_dim))
        if t > start_steps:
            if uncertainty_driven_exploration:
                # if t % uncertainty_policy_delay == 0:
                #     sess.run(lazy_ber_drop_actor_update)
                # a, unc_a = get_uncertainty_driven_explore_action(o)
                a = get_gaussian_noise_explore_action(o, act_noise)
            else:
                a = get_gaussian_noise_explore_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # print(t)
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        ep_unc += np.sum(unc_a)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o,
                            a,
                            r,
                            o2,
                            d,
                            t,
                            steps_per_epoch,
                            start_time,
                            unc_a=unc_a)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # if without_delay_train:
        #     batch = replay_buffer.sample_batch(batch_size)
        #     feed_dict = {x_ph: batch['obs1'],
        #                  x2_ph: batch['obs2'],
        #                  a_ph: batch['acts'],
        #                  r_ph: batch['rews'],
        #                  d_ph: batch['done']
        #                  }
        #     q_step_ops = [q_loss, q_mean, tf.sqrt(tf.exp(q_logvar)), train_q_op]
        #     outs = sess.run(q_step_ops, feed_dict)
        #     logger.store(LossQ=outs[0], QVals=outs[1], QStds=outs[2])
        #
        #     # Delayed policy update
        #     outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
        #     logger.store(LossPi=outs[0])

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            if not without_delay_train:
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }

                    q_step_ops = [q_loss, v_out, a_out, q_out, train_q_op]
                    outs = sess.run(q_step_ops, feed_dict)
                    logger.store(LossQ=outs[0],
                                 VVals=outs[1],
                                 AVals=outs[2],
                                 QVals=outs[3])
                    # print('LossQ={}, QVals={}, QLogVars={}'.format(outs[0], outs[1], outs[2]))
                    if j % policy_delay == 0:
                        # Delayed policy update
                        outs = sess.run([pi_loss, train_pi_op, target_update],
                                        feed_dict)
                        logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            logger.store(EpUnc=ep_unc)
            o, r, d, ep_ret, ep_len, ep_unc = env.reset(), 0, False, 0, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('AVals', with_min_and_max=True)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('EpUnc', with_min_and_max=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
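
A minimal launch sketch for d3pg (not part of the original file). The Pendulum-v0 choice, network sizes, and output directory are assumptions; ac_kwargs must provide 'hidden_sizes', since it is read directly above, and uncertainty_driven_exploration is switched off here because the dropout-based exploration path is commented out in this version.

if __name__ == '__main__':
    import gym

    d3pg(lambda: gym.make('Pendulum-v0'),              # hypothetical continuous-control task
         ac_kwargs=dict(hidden_sizes=[300, 300]),      # required: read via ac_kwargs['hidden_sizes']
         seed=0,
         epochs=50,
         uncertainty_driven_exploration=False,         # the dropout-based branch is disabled above
         logger_kwargs=dict(output_dir='./d3pg_out', exp_name='d3pg'))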
Code example #4
def sac(env_fn,
        env_name,
        test_env_fns=[],
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        load_dir=None,
        num_procs=1,
        clean_every=200):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of
            observations as inputs, and ``q1`` and ``q2`` should accept a batch
            of observations and a batch of actions as inputs. When called,
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
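
        test_env_fns (list): Constructors for the evaluation environments,
            which are run together in a SubprocVecEnv.

        load_dir (str): If given, a previously saved PyTorch policy is loaded
            from this directory instead of building a fresh actor-critic.

        num_procs (int): Number of parallel training environment processes.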

    """
    from spinup.examples.pytorch.eval_sac import load_pytorch_policy

    print(f"SAC proc_id {proc_id()}")
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    if proc_id() == 0:
        writer = SummaryWriter(log_dir=os.path.join(
            logger.output_dir, str(datetime.datetime.now())),
                               comment=logger_kwargs["exp_name"])

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = SubprocVecEnv([partial(env_fn, rank=i) for i in range(num_procs)],
                        "spawn")
    test_env = SubprocVecEnv(test_env_fns, "spawn")
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    if load_dir is not None:
        _, ac = load_pytorch_policy(load_dir, itr="", deterministic=False)
    else:
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
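            # Soft Bellman backup: the entropy bonus enters the target as
            # -alpha * logp_a2, alongside the clipped double-Q estimate.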
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing TD feats-losses
    def compute_loss_feats(data):
        o, a, r, o2, d, feats = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done'], data["feats"]

        feats = torch.stack(list(feats.values())).T  # (nbatch, nfeats)
        feats1 = ac.q1.predict_feats(o, a)
        feats2 = ac.q2.predict_feats(o, a)

        feats_keys = replay_buffer.feats_keys

        # Bellman backup for feature functions
        with torch.no_grad():
            a2, _ = ac.pi(o2)

            # Target feature values
            feats1_targ = ac_targ.q1.predict_feats(o2, a2)
            feats2_targ = ac_targ.q2.predict_feats(o2, a2)
            feats_targ = torch.min(feats1_targ, feats2_targ)
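            # Per-feature TD backup; (1 - d) is broadcast across the feature axis
            # via d[:, None], so terminal transitions cut off every feature target.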
            backup = feats + gamma * (1 - d[:, None]) * feats_targ

        # MSE loss against Bellman backup
        loss_feats1 = ((feats1 - backup)**2).mean(axis=0)
        loss_feats2 = ((feats2 - backup)**2).mean(axis=0)
        loss_feats = loss_feats1 + loss_feats2

        # Useful info for logging
        feats_info = dict(Feats1Vals=feats1.detach().numpy(),
                          Feats2Vals=feats2.detach().numpy())

        return loss_feats, feats_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, feats_keys):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        loss_feats, feats_info = compute_loss_feats(data)
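        # Note: loss_feats is computed here for logging only; no backward pass or
        # optimizer step is taken on it in this update.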
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Feature loss
        keys = [f"LossFeats_{key}" for key in feats_keys]
        for key, val in zip(keys, loss_feats):
            logger.store(**{key: val.item()})

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(feats_keys):
        num_envs = len(test_env_fns)
        env_ep_rets = np.zeros(num_envs)
        for j in range(num_test_episodes):
            o, d = test_env.reset(), np.zeros(num_envs, dtype=bool)
            ep_len = np.zeros(num_envs)
            while not (np.all(d) or np.all(ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, info = test_env.step(get_action(o, True))
                env_ep_rets += r
                ep_len += 1
            # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
        for ti in range(num_envs):
            logger.store(
                **{f"TestEpRet_{ti}": env_ep_rets[ti] / num_test_episodes})

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(num_procs)

    # Main loop: collect experience in env and update/log each epoch
    epoch = 0
    update_times, clean_times = 0, 0
    t = 0
    while t <= total_steps:
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = np.stack([env.action_space.sample() for _ in range(num_procs)])

        # Step the env
        o2, r, d, info = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        if np.all(ep_len == max_ep_len):
            d.fill(False)

        # Store experience to replay buffer
        replay_buffer.store_vec(o, a, r, o2, d,
                                [inf["features"] for inf in info])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling, assumes all subenvs end at the same time
        if np.all(d) or np.all(ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            if clean_every > 0 and epoch // clean_every >= clean_times:
                env.close()
                test_env.close()
                env = SubprocVecEnv(
                    [partial(env_fn, rank=i) for i in range(num_procs)],
                    "spawn")
                test_env = SubprocVecEnv(test_env_fns, "spawn")
                clean_times += 1

            o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(
                num_procs)

        # Update handling
        if t >= update_after and t / update_every > update_times:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, feats_keys=replay_buffer.feats_keys)
            update_times += 1

        # End of epoch handling
        if t // steps_per_epoch > epoch:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env_name': env_name}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(replay_buffer.feats_keys)

            # Update tensorboard
            if proc_id() == 0:
                log_perf_board = ['EpRet', 'EpLen', 'Q1Vals', 'Q2Vals'] + [
                    f"TestEpRet_{ti}" for ti in range(len(test_env_fns))
                ]
                log_loss_board = ['LogPi', 'LossPi', 'LossQ'] + [
                    key
                    for key in logger.epoch_dict.keys() if "LossFeats" in key
                ]
                log_board = {
                    'Performance': log_perf_board,
                    'Loss': log_loss_board
                }
                for key, value in log_board.items():
                    for val in value:
                        mean, std = logger.get_stats(val)
                        if key == 'Performance':
                            writer.add_scalar(key + '/Average' + val, mean,
                                              epoch)
                            writer.add_scalar(key + '/Std' + val, std, epoch)
                        else:
                            writer.add_scalar(key + '/' + val, mean, epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            if proc_id() == 0:
                writer.flush()

                import psutil
                # gives a single float value
                cpu_percent = psutil.cpu_percent()
                # gives an object with many fields
                mem_percent = psutil.virtual_memory().percent
                print(f"Used cpu avg {cpu_percent}% memory {mem_percent}%")
                cpu_separate = psutil.cpu_percent(percpu=True)
                for ci, cval in enumerate(cpu_separate):
                    print(f"\t cpu {ci}: {cval}%")
                # buf_size = replay_buffer.get_size()
                # print(f"Replay buffer size: {buf_size//1e6}MB {buf_size // 1e3} KB {buf_size % 1e3} B")
        t += num_procs

    if proc_id() == 0:
        writer.close()
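For reference, the entropy-regularized target that ``compute_loss_q`` builds above can be written as a small standalone helper. This is a minimal sketch, not part of the original code; the callables ``pi``, ``q1_targ`` and ``q2_targ`` are hypothetical stand-ins for ``ac.pi``, ``ac_targ.q1`` and ``ac_targ.q2``.

import torch

def soft_q_target(r, d, o2, pi, q1_targ, q2_targ, gamma=0.99, alpha=0.2):
    # y = r + gamma * (1 - d) * (min(Q1', Q2')(s', a') - alpha * log pi(a' | s')),
    # where a' is sampled from the *current* policy at the next observation o2.
    with torch.no_grad():
        a2, logp_a2 = pi(o2)
        q_pi_targ = torch.min(q1_targ(o2, a2), q2_targ(o2, a2))
        return r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)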
Code example #5
def ddpg_multihead_n_step(env_name,
                          actor_hidden_layers=[300, 300],
                          critic_shared_hidden_layers=[300],
                          critic_separated_head_hidden_layers=[300],
                          seed=0, dropout_rate = 0,
                          steps_per_epoch=5000, epochs=100, replay_size=int(1e6),
                          reward_scale = 1,
                          multi_head_multi_step_size = [1, 2, 3, 4, 5],
                          actor_omit_top_k_Q = 2, actor_omit_low_k_Q = 1,
                          critic_omit_top_k_Q = 2, critic_omit_low_k_Q = 1,
                          q_loss_type = 'QLossReduceMeanMean',
                          multihead_q_std_penalty = 0.2,
                          separate_action_and_prediction = False,
                          multi_head_bootstrapping = False,
                          target_policy_smoothing=True, target_noise = 0.2, noise_clip = 0.5,
                          random_n_step=False, random_n_step_low=1, random_n_step_high=5,
                          gamma=0.99, without_delay_train=False, obs_noise_scale=0,
                          nonstationary_env=False,
                          gravity_change_pattern = 'gravity_averagely_equal',
                          gravity_cycle = 1000, gravity_base = -9.81,
                          polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
                          act_noise=0.1, random_action_baseline=False,
                          max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_name (str): Name of a registered Gym environment. The training and
            test environments are created with ``gym.make(env_name)``.

        actor_hidden_layers (list): Sizes of the hidden layers in the MLP actor.

        critic_shared_hidden_layers (list): Sizes of the hidden layers shared by
            all critic heads.

        critic_separated_head_hidden_layers (list): Sizes of the hidden layers in
            each separate critic head.

        multi_head_multi_step_size (list): n-step return horizon assigned to each
            critic head (one head per entry).

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture

    # Inputs to computation graph
    multi_head_size = len(multi_head_multi_step_size)

    x_ph = tf.placeholder(dtype=tf.float32, shape=(None, obs_dim))
    a_ph = tf.placeholder(dtype=tf.float32, shape=(None, act_dim))
    # TODO: use different mini-batch
    x2_ph = tf.placeholder(dtype=tf.float32, shape=(None, max(multi_head_multi_step_size), obs_dim))
    r_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    d_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    n_step_ph = tf.placeholder(dtype=tf.float32, shape=())

    actor_hidden_sizes = actor_hidden_layers
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_shared_hidden_sizes = critic_shared_hidden_layers
    critic_head_hidden_sizes = critic_separated_head_hidden_layers
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        actor = MLP(layer_sizes=actor_hidden_sizes+[act_dim],
                    hidden_activation=actor_hidden_activation, output_activation=actor_output_activation)
        multihead_critic = MultiHeadMLP(shared_hidden_layer_sizes=critic_shared_hidden_sizes,
                                        multi_head_layer_sizes=[critic_head_hidden_sizes+[1] for i in range(multi_head_size)],
                                        hidden_activation=critic_hidden_activation,
                                        output_activation=critic_output_activation)
        # Set training=False to ignore dropout masks
        pi = act_limit * actor(x_ph, training=False)
        multihead_q = [tf.squeeze(head_out, axis=1) for head_out in multihead_critic(tf.concat([x_ph,a_ph], axis=-1))]
        multihead_q_pi = [tf.squeeze(head_out, axis=1) for head_out in multihead_critic(tf.concat([x_ph, pi], axis=-1))]

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is 
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        actor_targ = MLP(layer_sizes=actor_hidden_sizes+[act_dim],
                         hidden_activation=actor_hidden_activation, output_activation=actor_output_activation)
        multihead_critic_targ = MultiHeadMLP(shared_hidden_layer_sizes=critic_shared_hidden_sizes,
                                             multi_head_layer_sizes=[critic_head_hidden_sizes+[1] for i in range(multi_head_size)],
                                             hidden_activation=critic_hidden_activation,
                                             output_activation=critic_output_activation)

        # Set training=False to ignore dropout for backup target value
        # Crucial: feed target networks with different next n-step observation
        multihead_q_pi_targ = []
        # for head_i in range(multi_head_size):
        for h_i, n_step in enumerate(multi_head_multi_step_size):
            print('Head-{}: {}-step'.format(h_i, n_step))
            head_x2_ph = tf.squeeze(tf.slice(x2_ph, [0, n_step-1,0], [batch_size, 1, obs_dim]), axis=1)

            _ = actor_targ(head_x2_ph)  # call once so the target actor's variables are created (and can be copied)
            if separate_action_and_prediction:
                head_pi_targ = act_limit * actor(head_x2_ph)
            else:
                head_pi_targ = act_limit * actor_targ(head_x2_ph)

            if target_policy_smoothing:
                # Target policy smoothing, by adding clipped noise to target actions
                epsilon = tf.random_normal(tf.shape(head_pi_targ), stddev=target_noise)
                epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
                head_pi_targ = head_pi_targ + epsilon
                head_pi_targ = tf.clip_by_value(head_pi_targ, -act_limit, act_limit)

            # TODO: test multi-head bootstrapping with StdQPenalty
            if multi_head_bootstrapping:
                # all heads calculate n-step bootstrapping,
                #  omit overestimation and underestimation of n-step bootstrapped Q
                after_omit_overestimation = tf.math.top_k(
                    -tf.squeeze(tf.stack(multihead_critic_targ(tf.concat([head_x2_ph, head_pi_targ], axis=-1)), axis=2),
                                axis=1), multi_head_size - critic_omit_top_k_Q)[0]
                after_omit_underestimation = tf.math.top_k(-after_omit_overestimation,
                                                           multi_head_size - critic_omit_top_k_Q - critic_omit_low_k_Q)[0]
                multihead_q_pi_targ.append(tf.reduce_mean(after_omit_underestimation, axis=1))
            else:
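                # Without multi-head bootstrapping, each head bootstraps only from
                # its own target head's Q estimate.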
                multihead_q_pi_targ.append(
                    tf.squeeze(multihead_critic_targ(tf.concat([head_x2_ph, head_pi_targ], axis=-1))[h_i], axis=1))


    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts)

    # Bellman backup for Q function
    multihead_q_loss_list = []
    multihead_q_pi_loss_list = []
    multihead_backup_list = []
    for h_i, n_step in enumerate(multi_head_multi_step_size):
        head_q = multihead_q[h_i]
        head_q_pi_targ = multihead_q_pi_targ[h_i]
        head_q_pi = multihead_q_pi[h_i]

        head_backup = tf.stop_gradient(tf.reduce_sum(tf.multiply(tf.pow(gamma, tf.range(0, n_step, dtype=tf.float32))
                                                                 * (1 - tf.slice(d_ph, [0, 0], [batch_size, n_step])),
                                                                 tf.slice(r_ph, [0, 0], [batch_size, n_step])), axis=1)
                                       + gamma ** n_step * (1 - tf.reshape(tf.slice(d_ph, [0, n_step], [batch_size, 1]), [-1])) * head_q_pi_targ)
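        # head_backup is the n-step return for this head:
        #   y = sum_{k=0}^{n-1} gamma^k * (1 - d_k) * r_k
        #       + gamma^n * (1 - d_n) * Q_targ(s_{t+n}, pi_targ(s_{t+n}))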
        multihead_backup_list.append(head_backup)
        multihead_q_loss_list.append(tf.reduce_mean((head_q-head_backup)**2))
        multihead_q_pi_loss_list.append(-tf.reduce_mean(head_q_pi))


    # DDPG losses
    # 1. pi loss
    all_q_pi = tf.stack(multihead_q_pi, axis=1)

    # pi_loss = tf.reduce_mean(multihead_q_pi_loss_list) # Works, but not stable

    # Works good, need to test generalization
    # pi_loss = tf.reduce_mean(tf.math.top_k(-all_q_pi, multi_head_size - omit_top_k_Q)[0])

    after_omit_overestimation_for_actor = tf.math.top_k(-all_q_pi, multi_head_size - actor_omit_top_k_Q)[0]
    after_omit_underestimation_for_actor = tf.math.top_k(-after_omit_overestimation_for_actor, multi_head_size - actor_omit_top_k_Q - actor_omit_low_k_Q)[0]
    pi_loss = tf.reduce_mean(tf.reduce_mean(-after_omit_underestimation_for_actor, axis=1))
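    # The two top_k calls above drop the actor_omit_top_k_Q largest (likely
    # overestimated) and actor_omit_low_k_Q smallest (likely underestimated) head
    # estimates; the actor then maximizes the mean of the remaining Q values.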

    # # TODO:test, seems not work
    # pi_loss = tf.reduce_mean(tf.reduce_mean(tf.math.top_k(-all_q_pi, multi_head_size - actor_omit_top_k_Q)[0], axis=1) +
    #                          multihead_q_std_penalty * tf.math.reduce_variance(all_q_pi, axis=1))

    # # import pdb; pdb.set_trace()
    # pi_loss = tf.reduce_sum(tf.reduce_mean(tf.math.top_k(-tf.stack(multihead_q_pi, axis=1), multi_head_size - omit_top_k_Q)[0], axis=0))

    # pi_loss = tf.reduce_mean(-multihead_q_pi[0]) # Too slow

    # # slow
    # pi_loss = tf.reduce_mean(tf.reduce_sum(tf.math.top_k(-all_q_pi,
    #                                                       multi_head_size - actor_omit_top_k_Q)[0], axis=1))

    # 2. q loss
    all_q = tf.stack(multihead_q, axis=1)
    all_q_backup = tf.stack(multihead_backup_list, axis=1)

    if q_loss_type == 'QLossReduceMeanMean':
        q_loss = tf.reduce_mean(multihead_q_loss_list)     # works
    elif q_loss_type == 'QLossReduceSumMean':
        q_loss = tf.reduce_sum(multihead_q_loss_list)        # Works good for Swimmer-s3
    elif q_loss_type == 'QLossReduceMeanAll':
        q_loss = tf.reduce_mean((all_q - all_q_backup) ** 2) # (Currently the best) Works good for Swimmer-s0
    elif q_loss_type == 'QLossReduceSumAll':
        q_loss = tf.reduce_sum((all_q - all_q_backup) ** 2)

    # currently the best, and the policy has approximately monotonic improvement
    # TODO: multihead_q_std_penalty should be dynamically changed
    # q_loss = tf.reduce_mean(tf.reduce_mean((all_q - all_q_backup)**2, axis=1) +
    #                         multihead_q_std_penalty * tf.math.reduce_std(all_q, axis=1))

    # # variance penalty is better than standard deviation penalty
    # q_loss = tf.reduce_mean(tf.reduce_mean((all_q - all_q_backup) ** 2, axis=1) +
    #                         multihead_q_std_penalty * tf.math.reduce_variance(all_q, axis=1))

    # # TODO: test reduce_sum and reduce_var
    # q_loss = tf.reduce_mean(tf.reduce_sum((all_q - all_q_backup) ** 2, axis=1) +
    #                         multihead_q_std_penalty * tf.math.reduce_variance(all_q, axis=1))



    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables)
    train_q_op = q_optimizer.minimize(q_loss, var_list=multihead_critic.variables)

    # Polyak averaging for target variables
    target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                              for v_main, v_targ in zip(actor.variables+multihead_critic.variables,
                                                        actor_targ.variables+multihead_critic_targ.variables)])

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(actor.variables+multihead_critic.variables,
                                                        actor_targ.variables+multihead_critic_targ.variables)])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    # # TODO: delete env.render()
    # env.render()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps and not random_action_baseline:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # env.render()
        # Manipulate environment
        change_scale = 1/8
        if nonstationary_env:
            if gravity_change_pattern == 'gravity_averagely_equal':
                # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base / 2
                gravity = gravity_base + np.abs(gravity_base) * change_scale * np.sin(2 * np.pi / gravity_cycle * t)
            elif gravity_change_pattern == 'gravity_averagely_easier':
                # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1)
                gravity = gravity_base * change_scale * (np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * ( 1 - change_scale)
            elif gravity_change_pattern == 'gravity_averagely_harder':
                # gravity = gravity_base * 1 / 2 * (-np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base
                gravity = gravity_base * change_scale * (-np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * (
                            1 + change_scale)
            else:
                pass

            if 'PyBulletEnv' in env_name:
                env.env._p.setGravity(0, 0, gravity)
            elif 'Roboschool' in env_name:
                pass
            else:
                env.model.opt.gravity[2] = gravity
        # Step the env
        o2, r, d, _ = env.step(a)
        # Add observation noise
        o2 += obs_noise_scale * np.random.randn(obs_dim)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, reward_scale*r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        if t > batch_size and without_delay_train:
            if random_n_step:
                n_step = np.random.randint(random_n_step_low, random_n_step_high + 1, 1)[0]

            batch = replay_buffer.sample_batch_multihead_n_step(batch_size, n_step_end=max(multi_head_multi_step_size))
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done']
                         }
            # import pdb; pdb.set_trace()
            # Q-learning update
            outs = sess.run([multihead_q_loss_list, multihead_q, train_q_op], feed_dict)
            logger.store(**{'LossQ{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in
                            range(multi_head_size)})
            logger.store(**{'QVals{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[1][h_i] for h_i in
                            range(multi_head_size)})

            # Policy update
            outs = sess.run([multihead_q_pi_loss_list, train_pi_op, target_update], feed_dict)
            logger.store(**{'LossPi{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in
                            range(multi_head_size)})


        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            if not without_delay_train:
                for _ in range(ep_len):
                    if random_n_step:
                        n_step = np.random.randint(random_n_step_low, random_n_step_high+1, 1)[0]
                    batch = replay_buffer.sample_batch_multihead_n_step(batch_size, n_step_end=max(multi_head_multi_step_size))
                    feed_dict = {x_ph: batch['obs1'],
                                 x2_ph: batch['obs2'],
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done']
                                }

                    # Q-learning update
                    outs = sess.run([multihead_q_loss_list, multihead_q, train_q_op], feed_dict)
                    logger.store(**{'LossQ{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in
                                    range(multi_head_size)})
                    logger.store(**{'QVals{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[1][h_i] for h_i in
                                    range(multi_head_size)})

                    # Policy update
                    outs = sess.run([multihead_q_pi_loss_list, train_pi_op, target_update], feed_dict)
                    logger.store(**{'LossPi{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]): outs[0][h_i] for h_i in
                                    range(multi_head_size)})

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            for h_i in range(multi_head_size):
                logger.log_tabular('QVals{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]), with_min_and_max=True)
            for h_i in range(multi_head_size):
                logger.log_tabular('LossPi{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]), average_only=True)
            for h_i in range(multi_head_size):
                logger.log_tabular('LossQ{}_{}Step'.format(h_i, multi_head_multi_step_size[h_i]), average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
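As a plain-NumPy restatement of the per-head n-step target assembled with ``tf.slice`` above, the following minimal sketch (a hypothetical helper with array arguments, not part of the original code) computes the same quantity:

import numpy as np

def n_step_backup(rews, dones, q_next, gamma, n):
    # rews: (batch, >= n) rewards and dones: (batch, >= n + 1) done flags along
    # the sampled sub-trajectory; q_next: (batch,) target Q at the observation
    # n steps ahead.
    # y = sum_{k=0}^{n-1} gamma^k * (1 - d_k) * r_k + gamma^n * (1 - d_n) * q_next
    discounts = gamma ** np.arange(n)
    n_step_return = np.sum(discounts * (1 - dones[:, :n]) * rews[:, :n], axis=1)
    return n_step_return + gamma ** n * (1 - dones[:, n]) * q_next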
Code example #6
File: ppo.py Project: MishaLaskin/spinningup
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[
            'logp']

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
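        # Diagnostic: fraction of samples whose ratio fell outside the clip range.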
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
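The ``PPOBuffer`` used above is not shown in this snippet; its main job is GAE-Lambda advantage estimation. A minimal NumPy sketch of that computation, under the assumption that ``vals`` carries one bootstrap value at the end (as passed to ``buf.finish_path``):

import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    # rews: (T,) rewards for one trajectory segment.
    # vals: (T + 1,) value estimates; the last entry is the bootstrap value
    #       (0 if the trajectory ended in a true terminal state).
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv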
Code example #7
File: td3.py Project: schrammlb2/spinningup-clone
def td3(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=250,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        use_grad_penalty=True,
        penalty_scale=.025):
    """
    Twin Delayed Deep Deterministic Policy Gradient (TD3)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device=DEVICE)
    ac_targ = deepcopy(ac).to(device=DEVICE)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    clip_val = 10
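    # Gradient hooks: clamp each parameter's gradient to [-clip_val, clip_val]
    # and zero out NaN entries (grad != grad is True only where grad is NaN).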
    for p in ac.parameters():
        p.register_hook(lambda grad: torch.clamp(grad, -clip_val, clip_val))
        p.register_hook(lambda grad: torch.where(
            grad != grad, torch.tensor(0., device=DEVICE), grad))

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing TD3 Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        if use_grad_penalty:
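            # gradient_penalty is defined elsewhere (not shown here); it is assumed
            # to return a penalized target-Q estimate. Unlike the standard branch
            # below, this target is built without torch.no_grad().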
            pi_targ = ac_targ.pi(o2)

            # Target policy smoothing
            epsilon = torch.randn_like(pi_targ) * target_noise
            epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = torch.clamp(a2, -act_limit, act_limit)

            # Target Q-values
            q1_pi_targ = gradient_penalty(ac_targ.q1,
                                          o2,
                                          a2,
                                          epsilon=penalty_scale)
            q2_pi_targ = gradient_penalty(ac_targ.q2,
                                          o2,
                                          a2,
                                          epsilon=penalty_scale)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        else:
            with torch.no_grad():
                pi_targ = ac_targ.pi(o2)

                # Target policy smoothing
                epsilon = torch.randn_like(pi_targ) * target_noise
                epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
                a2 = pi_targ + epsilon
                a2 = torch.clamp(a2, -act_limit, act_limit)

                # Target Q-values
                q1_pi_targ = ac_targ.q1(o2, a2)
                q2_pi_targ = ac_targ.q2(o2, a2)
                q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
                backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        loss_info = dict(Q1Vals=q1.cpu().detach().numpy(),
                         Q2Vals=q2.cpu().detach().numpy())

        return loss_q, loss_info

    # Set up function for computing TD3 pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q1_pi = ac.q1(o, ac.pi(o))
        return -q1_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(q_params, lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, timer):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        torch.nn.utils.clip_grad_value_(q_params, clip_val)
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **loss_info)

        # Possibly update pi and target networks
        if timer % policy_delay == 0:

            # Freeze Q-networks so you don't waste computational effort
            # computing gradients for them during the policy learning step.
            for p in q_params:
                p.requires_grad = False

            # Next run one gradient descent step for pi.
            pi_optimizer.zero_grad()
            loss_pi = compute_loss_pi(data)
            loss_pi.backward()
            torch.nn.utils.clip_grad_value_(ac.pi.parameters(), clip_val)
            pi_optimizer.step()

            # Unfreeze Q-networks so you can optimize it at next DDPG step.
            for p in q_params:
                p.requires_grad = True

            # Record things
            logger.store(LossPi=loss_pi.item())

            # Finally, update target networks by polyak averaging.
            with torch.no_grad():
                for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                    # NB: We use an in-place operations "mul_", "add_" to update target
                    # params, as opposed to "mul" and "add", which would make new tensors.
                    p_targ.data.mul_(polyak)
                    p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32, device=DEVICE))
        a += noise_scale * np.random.randn(act_dim)
        if not np.isfinite(a).all():
            pdb.set_trace()
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def test_agent_transfer():
        worst_case = np.inf
        for j in range(num_test_episodes):
            test_env = env_fn(transfer=True)
            # test_env = env_fn()
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            worst_case = min(ep_ret, worst_case)
            logger.store(TransferEpRet=ep_ret, TransferEpLen=ep_len)
        # logger.store(WorstTransferEpRet=worst_case)

    def test_agent_random():
        worst_case = np.inf
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o += np.random.normal(0, .01, o.shape)
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, 0))
                o += np.random.normal(0, .01, o.shape)
                ep_ret += r
                ep_len += 1
            worst_case = min(ep_ret, worst_case)
            logger.store(RandomEpRet=ep_ret, RandomEpLen=ep_len)

    def test_agent_adversarial_noise():
        def adv_step(o):
            tens_o = torch.as_tensor(o, device=DEVICE)
            v = lambda obs: ac.q1(tens_o, ac.pi(obs))
            #Value of policy given perturbed observation
            adv_obs = state_gradient(v, tens_o, epsilon=2e-2)
            #Bounded adversarial perturbation to observation
            return adv_obs.cpu().numpy()

        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = adv_step(o)
            # o += np.random.normal(1, .01, o.shape)
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, 0))
                o = adv_step(o)
                # o += np.random.normal(1, .01, o.shape)
                ep_ret += r
                ep_len += 1
            logger.store(AdvEpRet=ep_ret, AdvEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, timer=j)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()
            test_agent_transfer()
            test_agent_random()
            # test_agent_adversarial_noise()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('TransferEpRet', with_min_and_max=True)
            logger.log_tabular('RandomEpRet', with_min_and_max=True)
            # logger.log_tabular('AdvEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TransferEpLen', average_only=True)
            logger.log_tabular('RandomEpLen', average_only=True)
            # logger.log_tabular('AdvEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
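Note: the update() routine above ends with an in-place polyak average of the target parameters. A minimal standalone sketch of that step (the soft_update helper and the toy Linear modules below are illustrative, not part of the original code):

import torch
import torch.nn as nn

def soft_update(net: nn.Module, net_targ: nn.Module, polyak: float = 0.995):
    # Move target parameters toward main parameters: targ <- polyak*targ + (1-polyak)*main
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), net_targ.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

# Toy usage: two independently initialized networks, so their weights differ;
# each call moves the target 0.5% of the way toward the main network.
net, net_targ = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(net, net_targ, polyak=0.995)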
Code example #8
File: hand_dqn.py  Project: DaanGoslinga/gym-mediator
class DQNAgent:
    """DQN Agent interacting with environment.
    
    Attributes:
        env (gym.Env): OpenAI Gym environment
        replaybuffer (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        epsilon (float): parameter for epsilon greedy policy
        max_epsilon (float): max value of epsilon
        min_epsilon (float): min value of epsilon
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including 
                           state, action, reward, next_state, done
    """
    def __init__(
        self,
        env: gym.Env,
        replay_size: int,
        batch_size: int,
        target_update: int,
        update_after: int,
        update_every: int,
        logger_kwargs,
    ):

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        seed = 0
        torch.manual_seed(seed)
        np.random.seed(seed)

        # obs_dim = len(env.observation_space.spaces)
        # action_dim = env.action_space.n
        obs_dim = 5
        action_dim = 3

        self.env = env
        self.replaybuffer = ReplayBuffer(obs_dim, replay_size, batch_size)
        self.batch_size = batch_size

        self.epsilon = 0.1

        self.target_update = target_update
        self.gamma = 0.9

        self.update_after = update_after
        self.update_every = update_every

        self.action_dict = {
            'Do_Nothing': 0,
            'Emergency_CA': 1,
            'Suggested_Shift_L4': 2,
            'Shift_L4': 3,
            'Correct_Distraction': 4,
        }

        self.sub_action_dict = {
            'Suggested_Shift_L4': 2,
            'Shift_L4': 3,
            'Correct_Distraction': 4,
        }

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer, only for self.dqn
        self.optimizer = Adam(self.dqn.parameters(), lr=0.001)
        self.scheduler = lr_scheduler.StepLR(self.optimizer,
                                             step_size=5,
                                             gamma=0.8)

        # Set up model saving
        self.logger.setup_pytorch_saver(self.dqn)

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False

    def simple_case_action(self, state: np.ndarray) -> int:
        '''
        Choose an action in normal and critical situations.
        '''
        # observations
        d_att = int(state[0][0])
        d_pp = int(state[0][3])
        auto_mode = int(state[0][4])
        c_sc = int(state[0][14])

        driver_fit_manual = auto_mode == 0 and d_att == 0
        collision_risk = c_sc == 1
        with_preference = d_pp == 1

        # Initialization of action
        # If action == -10, the situation is neither normal nor critical.
        action = -10

        # Status 1: Critical
        if collision_risk:
            action = self.action_dict[
                'Emergency_CA']  # DiscreteAction.Emergency_CA

        # Status 2: Normal
        if (driver_fit_manual
                and not with_preference) and (not collision_risk):
            action = self.action_dict[
                'Do_Nothing']  # DiscreteAction.Do_Nothing

        # Status: SSL4 feedback
        f_dc = int(state[0][18])  # 0 - no response; 1 - accept;  2 - reject
        f_as = int(
            state[0]
            [19])  # 0 - do nothing; 1 - correct; 2 - suggest shift to L4

        # SSL4 and feedbacks
        SSL4_last_time = f_as == 2
        no_response_ssl4 = f_dc == 0 and SSL4_last_time
        accept_ssl4 = f_dc == 1 and SSL4_last_time
        reject_ssl4 = f_dc == 2 and SSL4_last_time

        if no_response_ssl4:
            # <No Response> and then repeat SSL4
            action = self.action_dict[
                'Suggested_Shift_L4']  # Suggested Shift L4

        elif accept_ssl4:
            # <Accept>  and Shift to L4
            action = self.action_dict['Shift_L4']  # Shift_L4

        elif reject_ssl4:
            # <Reject> and Repeat Correction Distraction
            action = self.action_dict[
                'Correct_Distraction']  # Correct Distraction

        return action

    def complex_case_action(self, last_state: np.ndarray,
                            state: np.ndarray) -> int:
        '''
        Choose an action using decision trees in complex situations.
        '''

        # current observations
        d_att = int(state[0][0])
        d_fat = int(state[0][1])
        comfort = int(state[0][2])
        d_pp = int(state[0][3])

        auto_mode = int(state[0][4])
        L_max_now = int(state[0][5])
        L_max_next = int(state[0][6])

        c_sc = int(state[0][14])
        c_ue = int(state[0][15])
        c_vs = int(state[0][16])

        f_dc = int(state[0][18])  # 0 - no response; 1 - accept;  2 - reject
        f_as = int(
            state[0]
            [19])  # 0 - do nothing; 1 - correct; 2 - suggest shift to L4

        # last observations
        last_d_att = int(last_state[0][0])

        # initial values
        action = -10

        #--------------------------------------------------------#
        # Status 3: Degraded driver behavior

        # Correction
        distraction_begin = last_d_att == 0 and d_att == 1
        DN_last_time = f_as == 0

        if distraction_begin and DN_last_time:
            action = self.action_dict['Correct_Distraction']

        CD_last_time = f_as == 1
        distraction_eliminate = last_d_att == 1 and d_att == 0
        distraction_still = last_d_att == 1 and d_att == 1
        L4_available = L_max_now == 3

        if CD_last_time and distraction_eliminate:
            # Correction works
            action = self.action_dict['Do_Nothing']

        elif CD_last_time and distraction_still:
            # Correction fails
            if L4_available and f_dc != 2:
                # L4_available and not being rejected before
                action = self.action_dict['Suggested_Shift_L4']

            elif not L4_available:
                # L4 unavailable and thus repeat corrections
                action = self.action_dict['Correct_Distraction']

        # SSL4 and feedbacks
        SSL4_last_time = f_as == 2
        no_response_ssl4 = f_dc == 0 and SSL4_last_time
        accept_ssl4 = f_dc == 1 and SSL4_last_time
        reject_ssl4 = f_dc == 2 and SSL4_last_time

        if no_response_ssl4:
            # <No Response> and then repeat SSL4
            action = self.action_dict[
                'Suggested_Shift_L4']  # Suggested Shift L4

        elif accept_ssl4:
            # <Accept>  and Shift to L4
            action = self.action_dict['Shift_L4']  # Shift_L4

        elif reject_ssl4:
            # <Reject> and Repeat Correction Distraction
            action = self.action_dict[
                'Correct_Distraction']  # Correct Distraction

        #-------------------------------------------------------#
        if action == -10:
            print('Warning: no action generated from rule-based algorithm!')
            action_type, action = random.choice(
                list(self.sub_action_dict.items()))

        return action

    def get_action(self, state: np.ndarray, test_stage) -> np.ndarray:
        """Select an action for the input state using an epsilon-greedy policy."""

        if not test_stage and (np.random.random() < self.epsilon):
            # selected_action = self.env.action_space.sample()
            action_type, selected_action = random.choice(
                list(self.sub_action_dict.items()))

        else:

            selected_action = self.dqn(
                torch.FloatTensor(state).to(self.device)).argmax()

            # selected_action = selected_action.detach().cpu().numpy()
            # TODO +2
            selected_action = selected_action.detach().cpu().numpy() + 2

        return selected_action

    def custom_obs(self, state):
        '''
        Return the customized (reduced) state.
        '''
        d_att = int(state[0][0])
        f_dc = int(state[0][18])  # 0 - no response; 1 - accept; 2 - reject; -1 - inactivated
        f_as = int(state[0][19])
        auto_mode = int(state[0][4])
        L_max_now = int(state[0][5])

        cd_activate = 1 if f_as == 1 else 0
        L4_available = 1 if L_max_now == 3 else 0
        ssl4_activate = 1 if f_as == 2 else 0

        c_state = []
        c_state.append([d_att, cd_activate, L4_available, ssl4_activate, f_dc])

        return np.array(c_state)

    def run(self, num_epoch: int, episodes_per_epoch: int, test_episodes: int):
        """Train the agent.
        episode_reward: lists of episode rewards
        sum_rew: float: reward of each iteration
        iter_time: the number of time steps
        update_cnt: determine the frequency of updating the target network

        """
        print('********************* Training Starts *********************')

        self.is_test = False

        model_loss, epsilons = [], []
        episode_id, step_id, iter_time, update_cnt = 0, 0, 0, 0
        sum_rew = 0.0

        state = self.env.reset()
        last_state = state
        already_starts = False
        episode_begin = 1e8
        total_episode = num_epoch * episodes_per_epoch

        while episode_id < total_episode:

            step_id += 1
            cus_state = self.custom_obs(state)

            # 1: >>>> Choose action
            action = self.simple_case_action(state)
            if action == -10:
                if iter_time < 2000:
                    action_type, action = random.choice(
                        list(self.sub_action_dict.items()))
                else:
                    action = self.get_action(cus_state, False)

            # 2: >>>> Run episode
            next_state, reward, done, d_info = self.env.step(action)

            # 3: >>>> Determine when data should start being saved in the replay buffer
            mydict = self.action_dict
            d_att_last = int(last_state[0][0])
            d_att_now = int(state[0][0])
            d_att_next = int(next_state[0][0])
            l4_next = 1 if int(next_state[0][5]) == 3 else 0

            degradation_begin = d_att_now == 0 and d_att_next == 1
            L4_available = int(state[0][5]) == 3

            TESD = state[0][12]

            if degradation_begin and not already_starts:
                episode_begin = step_id
                already_starts = True

            if not self.is_test and step_id >= episode_begin:
                if d_att_now == 1:
                    # put distraction states into replay buffer
                    cus_next_state = self.custom_obs(next_state)
                    self.transition = [
                        cus_state, action, reward, cus_next_state, done
                    ]
                    self.replaybuffer.store(*self.transition)
                    iter_time = iter_time + 1
                    print(
                        'Episode:{}, Step:{}, Iteration:{}, State[d_att,cd_activate,L4_available,ssl4_activate,f_dc]:{}'
                        .format(episode_id, step_id, iter_time, cus_state[0]))
                    print(
                        'Dis_Last:{}, Dis_Now:{}, Dis_Next:{},L4_Next:{}, Reward+Cost:{}, Action:{}'
                        .format(
                            d_att_last, d_att_now, d_att_next, l4_next, reward,
                            list(mydict.keys())[list(
                                mydict.values()).index(action)]))

            # 4: >>>> Update state and sum of rewards

            state = next_state
            sum_rew += reward

            # 5. >>>> End and Reset
            if done:
                print('Done infos: ', d_info)
                print('Return(Sum of Rewards):{}'.format(round(sum_rew, 1)))
                print(
                    '-------------------------------------------------------------------------------------------------------------------------'
                )

                # TODO
                self.logger.store(EpRet=sum_rew)

                # reset env
                state = self.env.reset()
                last_state = state
                sum_rew = 0.0
                episode_id = episode_id + 1
                step_id = 0
                already_starts = False
                episode_begin = 1e8

            # 6. >> Update Model Parameters
            if (iter_time >= self.update_after) and (iter_time %
                                                     self.update_every == 0):
                for j in range(self.update_every):
                    self.update_model()
                    update_cnt += 1
                    if update_cnt % self.target_update == 0:
                        self._target_hard_update()

            # 7. Save and log information
            if (iter_time >= self.update_after
                    and done) and (episode_id + 1) % episodes_per_epoch == 0:

                # Epoch information
                epoch = episode_id // episodes_per_epoch

                self.scheduler.step()
                # self.lr_list.append(optimizer.state_dict()['param_groups'][0]['lr'])

                # Save model
                self.logger.save_state({'env': self.env}, None)

                # Test the performance of the agent
                self.test_agent(test_episodes)

                # Save important info
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('EpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('QVals', with_min_and_max=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('TotalEnvInteracts', iter_time)
                self.logger.dump_tabular()

    def update_model(self):
        """Update the model by gradient descent."""

        samples = self.replaybuffer.sample_batch()
        loss_q, q_info = self._compute_dqn_loss(samples)
        self.optimizer.zero_grad()
        loss_q.backward()
        self.optimizer.step()

        self.logger.store(LossQ=loss_q.item(), **q_info)

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray]):
        """Return the DQN loss and logging info."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"].reshape(-1, 1)).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # G_t   = r + gamma * v(s_{t+1})  if state != Terminal
        #       = r                       otherwise
        curr_q_value = self.dqn(state).gather(1, action)

        next_q_value = self.dqn_target(next_state).gather(  # Double DQN
            1,
            self.dqn(next_state).argmax(dim=1, keepdim=True)).detach()
        mask = 1 - done
        target = (reward + self.gamma * next_q_value * mask).to(
            self.device)  # ground truth

        # calculate dqn loss
        loss_fun = torch.nn.MSELoss().to(self.device)
        loss_q = loss_fun(curr_q_value, target)
        loss_info = dict(QVals=curr_q_value.detach().cpu().numpy())

        return loss_q, loss_info

    def test_agent(self, test_episodes):
        """ Test the agent """
        for j in range(test_episodes):
            sum_rew, done, state = 0.0, False, self.env.reset()
            while not done:
                action = self.simple_case_action(state)
                if action == -10:
                    cus_state = self.custom_obs(state)
                    action = self.get_action(cus_state, True)
                next_state, reward, done, infos = self.env.step(action)
                state = next_state
                sum_rew += reward

            self.logger.store(TestEpRet=sum_rew)
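Note: _compute_dqn_loss above builds a Double DQN target, where the online network selects the greedy next action and the target network evaluates it. A small self-contained sketch of just that target computation with toy tensors (the names q_online_next / q_target_next are illustrative placeholders, not attributes of the class):

import torch

gamma = 0.9
# Toy Q-values for a batch of 2 next-states over 3 actions.
q_online_next = torch.tensor([[1.0, 2.0, 0.5],
                              [0.1, 0.0, 0.3]])   # online net: used only to pick the action
q_target_next = torch.tensor([[0.8, 1.5, 0.7],
                              [0.2, 0.1, 0.4]])   # target net: used to score the picked action
reward = torch.tensor([[1.0], [0.0]])
done = torch.tensor([[0.0], [1.0]])

greedy_a = q_online_next.argmax(dim=1, keepdim=True)   # action selection by the online net
next_q = q_target_next.gather(1, greedy_a).detach()    # action evaluation by the target net
target = reward + gamma * next_q * (1 - done)
print(target)  # tensor([[2.3500], [0.0000]])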
Code example #9
def ddpg_mixed_n_step(env_name, ac_kwargs=dict(), seed=0, new_mlp=True, dropout_rate = 0,
                      steps_per_epoch=5000, epochs=100, replay_size=int(1e6),
                      lambda_value = 0.8,
                      n_step_start = 1,
                      n_step_end = 5,
                      rejection_method = 'no_rejection_average_weight',
                          n_step=1,
                          random_n_step=False, random_n_step_low=1, random_n_step_high=5,
                          gamma=0.99, without_delay_train=False, obs_noise_scale=0,
                          nonstationary_env=False,
                          gravity_change_pattern = 'gravity_averagely_equal',
                          gravity_cycle = 1000, gravity_base = -9.81,
                          polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
                          act_noise=0.1, random_action_baseline=False,
                          max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_name : Name of the Gym environment; copies of the environment are
            created with ``gym.make(env_name)`` and must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    n_step_size = n_step_end - (n_step_start - 1)
    x_ph = tf.placeholder(dtype=tf.float32, shape=(None, obs_dim))
    a_ph = tf.placeholder(dtype=tf.float32, shape=(None, act_dim))
    x2_ph = tf.placeholder(dtype=tf.float32, shape=(None, n_step_size, obs_dim))
    r_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    d_ph = tf.placeholder(dtype=tf.float32, shape=(None, None))
    n_step_ph = tf.placeholder(dtype=tf.float32, shape=())

    hidden_sizes = list(ac_kwargs['hidden_sizes'])
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        actor = MLP(layer_sizes=hidden_sizes + [act_dim],
                    hidden_activation=actor_hidden_activation, output_activation=actor_output_activation)
        critic = MLP(layer_sizes=hidden_sizes + [1],
                     hidden_activation=critic_hidden_activation, output_activation=critic_output_activation)
        # Set training=False to ignore dropout masks
        pi = act_limit * actor(x_ph, training=False)
        q = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1)), axis=1)
        q_pi = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1)), axis=1)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        actor_targ = MLP(layer_sizes=hidden_sizes + [act_dim],
                         hidden_activation=actor_hidden_activation, output_activation=actor_output_activation)
        critic_targ = MLP(layer_sizes=hidden_sizes + [1],
                          hidden_activation=critic_hidden_activation, output_activation=critic_output_activation)

        # Set training=False to ignore dropout for backup target value
        n_step_q_pi_targ = []
        for n_step_i in range(n_step_size):
            n_step_x2 = tf.squeeze(tf.slice(x2_ph, [0, n_step_i, 0], [batch_size, 1, obs_dim]), axis=1)

            n_step_pi_targ = act_limit * actor_targ(n_step_x2)
            n_step_q_pi_targ.append(
                tf.squeeze(critic_targ(tf.concat([n_step_x2, n_step_pi_targ], axis=-1)), axis=1))

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts)

    # Bellman backup for Q function
    n_step_backup_list = []
    n_step_backup_weight_list = []
    n_step_backup_weighted_list = []
    for n_step in range(n_step_start, n_step_end+1):
        print(n_step)
        if n_step <= n_step_end - 1:
            n_step_weight = (1 - lambda_value) * lambda_value ** (n_step - n_step_start)
        else:
            n_step_weight = lambda_value ** (n_step_end - n_step_start)

        n_step_backup = tf.stop_gradient(tf.reduce_sum(tf.multiply(tf.pow(gamma, tf.range(0, n_step, dtype=tf.float32))
                                                                 * (1 - tf.slice(d_ph, [0, 0], [batch_size, n_step])),
                                                                 tf.slice(r_ph, [0, 0], [batch_size, n_step])), axis=1)
                                       + gamma ** n_step * (1 - tf.reshape(tf.slice(d_ph, [0, n_step], [batch_size, 1]), [-1])) * n_step_q_pi_targ[n_step-n_step_start])
        n_step_backup_list.append(n_step_backup)
        n_step_backup_weight_list.append(n_step_weight)
        n_step_backup_weighted_list.append(n_step_weight*n_step_backup)

    # TODO: could we consider standard deviation of n-step bootstrapped Q and reject outliers?
    #   because for different states the extent of overestimation might be different!


    # DDPG losses
    # 1. pi loss
    pi_loss = -tf.reduce_mean(q_pi)

    all_n_step_backup = tf.stack(n_step_backup_list, axis=1)

    if rejection_method == 'no_rejection_average_weight':
        q_loss = tf.reduce_mean((q - tf.reduce_mean(all_n_step_backup, axis=1)) ** 2)
    elif rejection_method == 'no_rejection_lambda_weight':
        q_loss = tf.reduce_mean((q - tf.reduce_sum(tf.stack(n_step_backup_weighted_list, axis=1), axis=1)) ** 2)
    elif rejection_method == 'mean_and_std_rejection':
        # 1. Mean and standard deviation rejection
        rejection_deviation_scale = 3
        all_n_step_backup_std = tf.math.reduce_std(all_n_step_backup, axis=1)
        all_n_step_backup_mean = tf.math.reduce_mean(all_n_step_backup, axis=1)
        rejection_upper_bound = tf.reshape(all_n_step_backup_mean + rejection_deviation_scale * all_n_step_backup_std,
                                 shape=(batch_size, 1))
        rejection_lower_bound = tf.reshape(all_n_step_backup_mean - rejection_deviation_scale * all_n_step_backup_std,
                                 shape=(batch_size, 1))
        # keep backups with mean - scale*std < value < mean + scale*std
        kept_mask = tf.dtypes.cast(tf.math.logical_and(tf.math.less(all_n_step_backup, rejection_upper_bound),
                                                       tf.math.greater(all_n_step_backup, rejection_lower_bound)), tf.float32)
        mean_backup_after_rejection = tf.reduce_sum(tf.math.multiply(all_n_step_backup, kept_mask), axis=1) / tf.reduce_sum(
            kept_mask, axis=1)
        q_loss = tf.reduce_mean((q - mean_backup_after_rejection) ** 2)
    elif rejection_method == 'interquartile_rejection':
        # 2. Interquartile-style rejection: drop the reject_top_k highest and
        #    reject_low_k lowest n-step backups, then average the rest.
        reject_low_k = 1
        reject_top_k = 1
        n_size = n_step_end - n_step_start + 1
        after_rejct_top_k = tf.math.top_k(-all_n_step_backup, n_size - reject_top_k)[0]
        # Drop reject_low_k further values so the low end is actually rejected
        after_rejct_low_k = tf.math.top_k(-after_rejct_top_k,
                                          n_size - reject_top_k - reject_low_k)[0]
        q_loss = tf.reduce_mean((q-tf.reduce_mean(after_rejct_low_k, axis=1))**2)

    # import pdb; pdb.set_trace()

    # q_loss = tf.reduce_mean((q - tf.reduce_sum(tf.stack(n_step_backup_weighted_list, axis=1), axis=1)) ** 2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables)
    train_q_op = q_optimizer.minimize(q_loss, var_list=critic.variables)

    # Polyak averaging for target variables
    target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                              for v_main, v_targ in zip(actor.variables + critic.variables,
                                                        actor_targ.variables + critic_targ.variables)])

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(actor.variables + critic.variables,
                                                      actor_targ.variables + critic_targ.variables)])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1,-1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = get_action(o, 0)
                # print(a)
                o, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    # # TODO: delete env.render()
    # env.render()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps and not random_action_baseline:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        #env.render()
        # Manipulate environment
        change_scale = 1/8
        if nonstationary_env == True:
            if gravity_change_pattern == 'gravity_averagely_equal':
                # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base / 2
                gravity = gravity_base + np.abs(gravity_base) * change_scale * np.sin(2 * np.pi / gravity_cycle * t)
            elif gravity_change_pattern == 'gravity_averagely_easier':
                # gravity = gravity_base * 1 / 2 * (np.cos(2 * np.pi / gravity_cycle * t) + 1)
                gravity = gravity_base * change_scale * (np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * ( 1 - change_scale)
            elif gravity_change_pattern == 'gravity_averagely_harder':
                # gravity = gravity_base * 1 / 2 * (-np.cos(2 * np.pi / gravity_cycle * t) + 1) + gravity_base
                gravity = gravity_base * change_scale * (-np.cos(2 * np.pi / gravity_cycle * t)) + gravity_base * (
                            1 + change_scale)
            else:
                pass

            if 'PyBulletEnv' in env_name:
                env.env._p.setGravity(0, 0, gravity)
            elif 'Roboschool' in env_name:
                pass
            else:
                env.model.opt.gravity[2] = gravity
        # Step the env
        o2, r, d, _ = env.step(a)
        # Add observation noise
        o2 += obs_noise_scale * np.random.randn(obs_dim)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        if t > batch_size and without_delay_train:
            if random_n_step:
                n_step = np.random.randint(random_n_step_low, random_n_step_high + 1, 1)[0]

            batch = replay_buffer.sample_batch_mixed_n_step(batch_size,
                                                            n_step_start=n_step_start,
                                                            n_step_end=n_step_end)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done']
                         }
            # import pdb; pdb.set_trace()
            # Q-learning update
            outs = sess.run([q_loss, q, n_step_backup_weighted_list, train_q_op], feed_dict)
            logger.store(LossQ=outs[0], QVals=outs[1])
            logger.store(**{'{}Step_Backup'.format(i): outs[2][i-n_step_start] for i in range(n_step_start, n_step_end + 1)})

            # Policy update
            outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
            logger.store(LossPi=outs[0])


        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            if not without_delay_train:
                for _ in range(ep_len):
                    if random_n_step:
                        n_step = np.random.randint(random_n_step_low, random_n_step_high+1, 1)[0]
                    batch = replay_buffer.sample_batch_mixed_n_step(batch_size,
                                                                    n_step_start=n_step_start,
                                                                    n_step_end=n_step_end)
                    feed_dict = {x_ph: batch['obs1'],
                                 x2_ph: batch['obs2'],
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done']
                                }
                    # mean_kept_mask = sess.run(tf.reduce_mean(tf.reduce_sum(kept_mask, axis=1)), feed_dict)
                    # print('mean_kept_mask={}'.format(mean_kept_mask))
                    # all_backup = sess.run(all_n_step_backup, feed_dict)
                    # upper_bound = sess.run(rejection_upper_bound, feed_dict)
                    # lower_bound = sess.run(rejection_lower_bound, feed_dict)
                    # # import pdb; pdb.set_trace()
                    # print('all_backup[0,:]={}, [{},{}] '.format(all_backup[0,:],lower_bound[0], upper_bound[0]))
                    # Q-learning update
                    outs = sess.run([q_loss, q, n_step_backup_weighted_list, train_q_op], feed_dict)
                    logger.store(LossQ=outs[0], QVals=outs[1])
                    logger.store(**{'{}Step_Backup'.format(i): outs[2][i-n_step_start] for i in range(n_step_start, n_step_end+1)})

                    # Policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            for i in range(n_step_start, n_step_end+1):
                logger.log_tabular('{}Step_Backup'.format(i), average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
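Note: the mixed n-step backup above weights each n-step return by (1 - lambda) * lambda**(n - n_step_start), with the final step absorbing the remaining mass lambda**(n_step_end - n_step_start), so the weights form a truncated TD(lambda)-style distribution that sums to one. A quick standalone NumPy check of that property (illustrative only, not part of the original function):

import numpy as np

lambda_value, n_step_start, n_step_end = 0.8, 1, 5

weights = []
for n_step in range(n_step_start, n_step_end + 1):
    if n_step <= n_step_end - 1:
        weights.append((1 - lambda_value) * lambda_value ** (n_step - n_step_start))
    else:
        weights.append(lambda_value ** (n_step_end - n_step_start))

print(np.round(weights, 4))            # [0.2 0.16 0.128 0.1024 0.4096]
print(np.isclose(sum(weights), 1.0))   # True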
Code example #10
File: elu_ddpg2.py  Project: LinghengMeng/spinningup
def elu_ddpg(
        env_fn,
        render_env=False,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        # TODO: change back to 10000
        start_steps=10000,  #start_steps=10000,
        reward_scale=5,
        act_noise=0.1,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    # x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    x_ph, \
    a_ph, a_mu_ph, a_alpha_ph, a_beta_ph, \
    x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, act_dim, act_dim, int(act_dim*(act_dim-1)/2), obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, pi_mu, pi_alpha, pi_beta, pi_cov, q, q_pi, q_pi_mu = actor_critic(
            x_ph, a_ph, **ac_kwargs)
        # pi, q, q_mu, q_sigma, q_pi, q_pi_mu, q_pi_sigma = actor_critic(x_ph, a_ph, **ac_kwargs)
    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, pi_mu_targ, pi_alpha_targ, pi_beta_targ, pi_cov_targ, _, q_pi_targ, q_pi_mu_targ = actor_critic(
            x2_ph, a_ph, **ac_kwargs)
        # pi_targ, _, _, _, q_pi_targ, q_pi_mu_targ, q_pi_sigma_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 logger_fname='experiences_log.txt',
                                 **logger_kwargs)

    # # Count variables
    # var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    # print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'%var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    # TODO: add term to penalize large variance, give penalize term cofficient
    #
    # pi_loss = tf.reduce_mean(-q_pi +
    #                          (1/act_dim) * tf.norm(pi_alpha,ord=2,axis=1) +
    #                          1/(act_dim*(act_dim-1)/2) * tf.norm(pi_beta,ord=1,axis=1))
    # Option 1. (pass)
    pi_loss = tf.reduce_mean(-q_pi)
    # Option 2. (pass)
    # pi_loss = tf.reduce_mean(-q_pi-q_pi_mu)
    # Option 3. (pass)
    # pi_loss = tf.reduce_mean(-q_pi-tf.linalg.logdet(pi_cov))
    # Option 4. (pass)
    # pi_loss = tf.reduce_mean(-q_pi - tf.linalg.logdet(tf.linalg.inv(pi_cov)))
    # Option 5.
    # pi_loss = tf.reduce_mean(-q_pi/2 -q_pi_mu/2 - tf.linalg.logdet(tf.linalg.inv(pi_cov)))
    # Option 5.
    # pi_loss = tf.reduce_mean(-q_pi/2 -q_pi_mu/2 - 0.001*tf.linalg.logdet(pi_cov))
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # import pdb; pdb.set_trace()
    writer = tf.summary.FileWriter(
        osp.join(logger_kwargs['output_dir'], 'graph'), sess.graph)
    writer.flush()
    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi_mu': pi_mu,
                              'pi_alpha': pi_alpha,
                              'pi_beta': pi_beta,
                              'q': q
                          })

    def get_action(o):
        # import pdb; pdb.set_trace()
        # a_mu, a_alpha, a_beta, a_cov = sess.run([pi_mu, pi_alpha, pi_beta, pi_cov], feed_dict={x_ph: o.reshape(1,-1)})
        #
        # if np.any(np.linalg.eigvals(a_cov[0])<=0):
        #     import pdb;pdb.set_trace()
        a, a_mu, a_alpha, a_beta, a_cov = sess.run(
            [pi, pi_mu, pi_alpha, pi_beta, pi_cov],
            feed_dict={x_ph: o.reshape(1, -1)})
        a, a_mu, a_alpha, a_beta, a_cov = a[0], a_mu[0], a_alpha[0], a_beta[
            0], a_cov[0]

        return a, a_mu, a_alpha, a_beta, a_cov

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a, a_mu, a_alpha, a_beta, a_cov = get_action(o)
                o, r, d, _ = test_env.step(a)
                # o, r, d, _ = test_env.step(a_mu)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a, a_mu, a_alpha, a_beta, a_cov = get_action(o)
            # import pdb; pdb.set_trace()
            print(a_alpha)
        else:
            a = env.action_space.sample()
            a_mu = a
            a_alpha = np.zeros((act_dim, ))
            a_beta = np.zeros((int(act_dim * (act_dim - 1) / 2), ))
            a_cov = np.zeros((act_dim, act_dim))

        # Step the env
        if render_env:
            env.render()
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        # TODO: use determinant as extrinsic reward
        print('np.linalg.det(a_cov)={}'.format(np.linalg.det(a_cov)))
        replay_buffer.store(o, a, a_mu, a_alpha, a_beta, a_cov,
                            reward_scale * (r + np.linalg.det(a_cov)), o2, d,
                            t, steps_per_epoch, start_time)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            # print('training ...')
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    a_mu_ph: batch['acts_mu'],
                    a_alpha_ph: batch['acts_alpha'],
                    a_beta_ph: batch['acts_beta'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # import pdb; pdb.set_trace()
                #
                # outs = sess.run([pi_mu, pi_alpha, pi_beta], feed_dict)

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])
                if outs[0] > 10000:
                    print('q_loss={}'.format(outs[0]))
                    # import pdb;
                    # pdb.set_trace()
                # Policy update
                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            # print('training done.')

        # if t%1000 == 0:
        #     print('step={}'.format(t))
        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            # TODO: change test number
            test_agent(2)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
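Note: elu_ddpg above stores a shaped reward reward_scale * (r + np.linalg.det(a_cov)), so the agent receives a bonus proportional to the volume of its action covariance. A tiny NumPy illustration of that bonus for a diagonal toy covariance (a sketch only; it assumes nothing about how pi_cov is actually parameterized):

import numpy as np

reward_scale = 5
r = 1.0                                  # raw environment reward
a_cov = np.diag([0.5 ** 2, 0.3 ** 2])    # toy 2-D action covariance
bonus = np.linalg.det(a_cov)             # 0.25 * 0.09 = 0.0225; shrinks as exploration narrows
shaped_r = reward_scale * (r + bonus)    # 5 * 1.0225 = 5.1125, the value written to the buffer
print(bonus, shaped_r)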
Code example #11
class sac_discrete_class:
    def __init__(self,
                 env_fn,
                 Actor=core.DiscreteMLPActor,
                 Critic=core.DiscreteMLPQFunction,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=4000,
                 epochs=100,
                 replay_size=int(5e5),
                 gamma=0.99,
                 polyak=0.995,
                 lr=1e-3,
                 alpha=0.2,
                 batch_size=100,
                 start_steps=10000,
                 update_after=1000,
                 update_times_every_step=50,
                 num_test_episodes=10,
                 max_ep_len=100000,
                 logger_kwargs=dict(),
                 save_freq=1,
                 automatic_entropy_tuning=True,
                 use_gpu=False,
                 gpu_parallel=False,
                 show_test_render=False,
                 last_save_path=None,
                 **kwargs):
        """
        Soft Actor-Critic (SAC)


        Args:
            env_fn : A function which creates a copy of the environment.
                The environment must satisfy the OpenAI Gym API.

            Actor: The constructor method for a PyTorch Module that maps a
                batch of observations to a batch of action probabilities of
                shape (batch, act_dim) over the discrete action set.

            Critic: The constructor method for a PyTorch Module that maps a
                batch of observations to a batch of Q-values of shape
                (batch, act_dim), one estimate per discrete action. Two
                independent critics (``critic1`` and ``critic2``) are built
                from it, along with target copies updated by polyak averaging.

            ac_kwargs (dict): Any kwargs appropriate for the Actor and Critic
                constructors you provided to SAC.

            seed (int): Seed for random number generators.

            steps_per_epoch (int): Number of steps of interaction (state-action pairs)
                for the agent and the environment in each epoch.

            epochs (int): Number of epochs to run and train agent.

            replay_size (int): Maximum length of replay buffer.

            gamma (float): Discount factor. (Always between 0 and 1.)

            polyak (float): Interpolation factor in polyak averaging for target
                networks. Target networks are updated towards main networks
                according to:

                .. math:: \\theta_{\\text{targ}} \\leftarrow
                    \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

                where :math:`\\rho` is polyak. (Always between 0 and 1, usually
                close to 1.)

            lr (float): Learning rate (used for both policy and value learning).

            alpha (float): Entropy regularization coefficient. (Equivalent to
                inverse of reward scale in the original SAC paper.)

            batch_size (int): Minibatch size for SGD.

            start_steps (int): Number of steps for uniform-random action selection,
                before running real policy. Helps exploration.

            update_after (int): Number of env interactions to collect before
                starting to do gradient descent updates. Ensures replay buffer
                is full enough for useful updates.

            update_times_every_step (int): Number of env interactions that should elapse
                between gradient descent updates. Note: Regardless of how long
                you wait between updates, the ratio of env steps to gradient steps
                is locked to 1.

            num_test_episodes (int): Number of episodes to test the deterministic
                policy at the end of each epoch.

            max_ep_len (int): Maximum length of trajectory / episode / rollout.

            logger_kwargs (dict): Keyword args for EpochLogger.

            save_freq (int): How often (in terms of gap between epochs) to save
                the current policy and value function.
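
            automatic_entropy_tuning (bool): If True, the entropy coefficient
                ``alpha`` is learned by gradient descent toward a target
                entropy instead of being held fixed.

            use_gpu (bool): If True and CUDA is available, run the networks
                on the GPU.

            gpu_parallel (bool): If True, wrap the networks in
                ``torch.nn.DataParallel``.

            show_test_render (bool): If True, render the test environment
                while evaluating the agent.

            last_save_path (str or None): Path to a checkpoint produced by
                this class. If given, networks, optimizers, and the epoch
                counter are restored from it before training resumes.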

        """
        self.ac_kwargs = ac_kwargs
        self.seed = seed
        self.steps_per_epoch = steps_per_epoch
        self.epochs = epochs
        self.replay_size = replay_size
        self.gamma = gamma
        self.polyak = polyak
        self.lr = lr
        self.alpha = alpha
        self.batch_size = batch_size
        self.start_steps = start_steps
        self.update_after = update_after
        self.update_times_every_step = update_times_every_step
        self.num_test_episodes = num_test_episodes
        self.max_ep_len = max_ep_len
        self.logger_kwargs = logger_kwargs
        self.save_freq = save_freq
        self.automatic_entropy_tuning = automatic_entropy_tuning
        self.use_gpu = use_gpu
        self.gpu_parallel = gpu_parallel
        self.show_test_render = show_test_render
        self.last_save_path = last_save_path
        self.kwargs = kwargs

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env = env_fn()
        self.test_env = env_fn()

        self.env.seed(seed)
        # env.seed(seed)
        # test_env.seed(seed)
        self.obs_dim = self.env.observation_space.shape
        self.act_dim = self.env.action_space.n

        # Create actor-critic module and target networks
        self.actor = Actor(self.obs_dim, self.act_dim, **ac_kwargs)
        self.critic1 = Critic(self.obs_dim, self.act_dim, **ac_kwargs)
        self.critic2 = Critic(self.obs_dim, self.act_dim, **ac_kwargs)

        self.critic1_targ = deepcopy(self.critic1)
        self.critic2_targ = deepcopy(self.critic2)
        # Whether to run on the GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda" if self.use_gpu else "cpu")
            if gpu_parallel:
                self.actor = torch.nn.DataParallel(self.actor)
                self.critic1 = torch.nn.DataParallel(self.critic1)
                self.critic2 = torch.nn.DataParallel(self.critic2)
                self.critic1_targ = torch.nn.DataParallel(self.critic1_targ)
                self.critic2_targ = torch.nn.DataParallel(self.critic2_targ)
        else:
            self.use_gpu = False
            self.gpu_parallel = False
            self.device = torch.device("cpu")
        # Freeze target networks with respect to optimizers (only update via polyak averaging)
        for p in self.critic1_targ.parameters():
            p.requires_grad = False
        for p in self.critic2_targ.parameters():
            p.requires_grad = False
        self.actor.to(self.device)
        self.critic1.to(self.device)
        self.critic2.to(self.device)
        self.critic1_targ.to(self.device)
        self.critic2_targ.to(self.device)

        # Experience buffer
        self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                          act_dim=1,
                                          size=replay_size,
                                          device=self.device)

        # # List of parameters for both Q-networks (save this for convenience)
        # q_params = itertools.chain(critic1.parameters(), critic2.parameters())

        if self.automatic_entropy_tuning:
            # Target entropy is set to 98% of the maximum possible entropy, log(|A|)
            self.target_entropy = -np.log(1.0 / self.act_dim) * 0.98
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=lr, eps=1e-4)

        # Count variables (protip: try to get a feel for how different size networks behave!)
        var_counts = tuple(
            core.count_vars(module)
            for module in [self.actor, self.critic1, self.critic2])
        self.logger.log(
            '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
            var_counts)

        # Set up optimizers for policy and q-function
        self.pi_optimizer = Adam(self.actor.parameters(), lr=lr)
        self.q1_optimizer = Adam(self.critic1.parameters(), lr=lr)
        self.q2_optimizer = Adam(self.critic2.parameters(), lr=lr)

        if last_save_path is not None:
            checkpoints = torch.load(last_save_path)
            self.epoch = checkpoints['epoch']
            self.actor.load_state_dict(checkpoints['actor'])
            self.critic1.load_state_dict(checkpoints['critic1'])
            self.critic2.load_state_dict(checkpoints['critic2'])
            self.pi_optimizer.load_state_dict(checkpoints['pi_optimizer'])
            self.q1_optimizer.load_state_dict(checkpoints['q1_optimizer'])
            self.q2_optimizer.load_state_dict(checkpoints['q2_optimizer'])
            self.critic1_targ.load_state_dict(checkpoints['critic1_targ'])
            self.critic2_targ.load_state_dict(checkpoints['critic2_targ'])

            # last_best_Return_per_local = checkpoints['last_best_Return_per_local']
            print("succesfully load last prameters")
        else:
            self.epoch = 0

            print("Dont load last prameters.")

    # Set up function for computing SAC Q-losses
    def compute_loss_q(self, data):

        # Bellman backup for Q functions
        with torch.no_grad():
            o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
                'obs2'], data['done']

            r = r.unsqueeze(-1) if r.ndim == 1 else r

            d = d.unsqueeze(-1) if d.ndim == 1 else d
            # Target actions come from *current* policy
            a2, (a2_p, logp_a2), _ = self.get_action(o2)

            # Target Q-values
            q1_pi_targ = self.critic1_targ(o2)
            q2_pi_targ = self.critic2_targ(o2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            # Soft state value: expectation over the action distribution,
            # i.e. the sum of probability-weighted (Q - alpha * log pi) terms.
            min_qf_next_target = a2_p * (q_pi_targ - self.alpha * logp_a2)
            min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
            backup = r + self.gamma * (1 - d) * min_qf_next_target

        q1 = self.critic1(o).gather(1, a.long())
        q2 = self.critic2(o).gather(1, a.long())
        # MSE loss against Bellman backup
        loss_q1 = F.mse_loss(q1, backup)
        loss_q2 = F.mse_loss(q2, backup)

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q1, loss_q2, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(self, data):
        state_batch = data['obs']
        action, (action_probabilities,
                 log_action_probabilities), _ = self.get_action(state_batch)
        qf1_pi = self.critic1(state_batch)
        qf2_pi = self.critic2(state_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        inside_term = self.alpha * log_action_probabilities - min_qf_pi
        # Expectation over the action distribution, averaged over the batch
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        # Probability-weighted log-probabilities (negative entropy), used for
        # the temperature (alpha) update
        log_action_probabilities = torch.sum(log_action_probabilities *
                                             action_probabilities,
                                             dim=1)
        # Useful info for logging
        pi_info = dict(LogPi=log_action_probabilities.detach().cpu().numpy())

        return policy_loss, log_action_probabilities, pi_info

    def take_optimisation_step(self,
                               optimizer,
                               network,
                               loss,
                               clipping_norm=None,
                               retain_graph=False):
        if not isinstance(network, list):
            network = [network]
        optimizer.zero_grad()  # reset gradients to 0
        loss.backward(
            retain_graph=retain_graph)  # this calculates the gradients
        if clipping_norm is not None:
            for net in network:
                torch.nn.utils.clip_grad_norm_(
                    net.parameters(),
                    clipping_norm)  # clip gradients to help stabilise training
        optimizer.step()  # this applies the gradients

    def soft_update_of_target_network(self, local_model, target_model, tau):
        """Updates the target network in the direction of the local network but by taking a step size
        less than one so the target network's parameter values trail the local networks. This helps stabilise training"""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
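
    # Note: this helper is equivalent to the in-place polyak update performed in
    # ``update`` below when called with tau = 1 - self.polyak, e.g. (hypothetical):
    #     self.soft_update_of_target_network(self.critic1, self.critic1_targ,
    #                                        tau=1 - self.polyak)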

    def update(self, data):
        # First run one gradient descent step for Q1 and Q2

        loss_q1, loss_q2, q_info = self.compute_loss_q(data)
        self.take_optimisation_step(
            self.q1_optimizer,
            self.critic1,
            loss_q1,
            5,
        )
        self.take_optimisation_step(
            self.q2_optimizer,
            self.critic2,
            loss_q2,
            5,
        )

        # Record things
        self.logger.store(LossQ=(loss_q1.item() + loss_q2.item()) / 2.,
                          **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # # computing gradients for them during the policy learning step.
        # for p in q_params:
        #     p.requires_grad = False

        # Next run one gradient descent step for pi.

        loss_pi, log_pi, pi_info = self.compute_loss_pi(data)
        # Record things
        self.logger.store(LossPi=loss_pi.item(), **pi_info)

        # # Unfreeze Q-networks so you can optimize it at next DDPG step.
        # for p in q_params:
        #     p.requires_grad = True

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()
            # logger.store(alpha_loss=alpha_loss.item())

        self.take_optimisation_step(
            self.pi_optimizer,
            self.actor,
            loss_pi,
            5,
        )

        with torch.no_grad():
            for p, p_targ in zip(self.critic1.parameters(),
                                 self.critic1_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)
            for p, p_targ in zip(self.critic2.parameters(),
                                 self.critic2_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(self.polyak)
                p_targ.data.add_((1 - self.polyak) * p.data)

        if self.automatic_entropy_tuning:
            self.take_optimisation_step(self.alpha_optim, None, alpha_loss,
                                        None)
            self.alpha = self.log_alpha.exp()

    def get_action(self, state):
        """Given the state, produces an action, the probability of the action, the log probability of the action, and
        the argmax action"""
        action_probabilities = self.actor(state)
        # Greedy (argmax) action per observation; used for deterministic
        # evaluation at test time.
        max_probability_action = torch.argmax(action_probabilities, dim=-1)
        action_distribution = Categorical(action_probabilities)
        action = action_distribution.sample().cpu()
        # Have to deal with situation of 0.0 probabilities because we can't do log 0
        z = action_probabilities == 0.0
        z = z.float() * 1e-8
        log_action_probabilities = torch.log(action_probabilities + z)
        return action, (action_probabilities,
                        log_action_probabilities), max_probability_action

    def test_agent(self):
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                if self.show_test_render:
                    self.test_env.render()
                # Take deterministic actions at test time
                with torch.no_grad():
                    _, (_, _), a = self.get_action(
                        torch.FloatTensor([o]).to(self.device))
                o, r, d, _ = self.test_env.step(a.cpu().item())
                ep_ret += r
                ep_len += 1
                text = "\r\x1b[32mEpoch: %s,  TestEp_ret: %s,  Testep_len: %s.\x1b[0m" % \
                       (self.epoch, ep_ret, ep_len)
                sys.stdout.write(text)
                sys.stdout.flush()
            self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def run(self):
        # Prepare for interaction with environment
        total_steps = self.steps_per_epoch * self.epochs
        start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0
        eps = 1

        t = self.epoch * self.steps_per_epoch if self.last_save_path is not None else 0

        # Main loop: collect experience in env and update/log each epoch
        self.actor.eval()
        while t < total_steps:

            # Until start_steps have elapsed, randomly sample actions
            # from a uniform distribution for better exploration. Afterwards,
            # use the learned policy.
            if t >= self.start_steps:
                with torch.no_grad():
                    a, _, _ = self.get_action(torch.FloatTensor([o]).to(self.device)) if o.shape == self.obs_dim else \
                        self.get_action(torch.FloatTensor(o).to(self.device))
                    a = a.cpu().item()
            else:
                a = np.random.randint(0, self.act_dim)

            # Step the env
            o2, r, d, _ = self.env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == self.max_ep_len else d

            # Store experience to replay buffer
            self.replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            # End of trajectory handling
            # ep_len == max_ep_len is the minimum episode length when the game succeeds
            if d or (ep_len == self.max_ep_len):
                self.logger.store(EpRet=ep_ret, EpLen=ep_len)
                text = "\r\x1b[32mEpoch: %s,  Episode: %s,  Ep_ret: %s,  ep_len: %s. [%s/%s] \x1b[0m" % \
                       (self.epoch, eps, ep_ret, ep_len, t+1,  total_steps)
                sys.stdout.write(text)
                sys.stdout.flush()
                o, ep_ret, ep_len = self.env.reset(), 0, 0
                # if eps % 30 == 0:
                #     logger.log('\nEpisode: %s\n,\tEp_ret: %s,\tep_len: %s' % (eps, ep_ret,ep_len))
                eps += 1

            # Update handling
            if t >= self.update_after and t % self.update_times_every_step == 0:
                self.actor.train()
                for j in range(self.update_times_every_step):
                    batch = self.replay_buffer.sample_batch(self.batch_size)
                    self.update(data=batch)

                self.actor.eval()
                # logger.save_epoch_Ret_optimizer_model(save_dict)
                # last_best_Return_per_local = Return_per_local
            # End of epoch handling
            # Every steps_per_epoch steps, once more than update_after steps have elapsed
            if (t + 1) % self.steps_per_epoch == 0 and t > self.update_after:
                # Only right after an update round (every update_times_every_step steps)
                if (t + 1) % self.update_times_every_step == 0:
                    self.epoch = (t + 1) // self.steps_per_epoch

                    # Save model
                    if proc_id() == 0 and (self.epoch) % self.save_freq == 0:
                        save_dict = {
                            'epoch': self.epoch,
                            'actor': self.actor.state_dict(),
                            'critic1': self.critic1.state_dict(),
                            'critic2': self.critic2.state_dict(),
                            'pi_optimizer': self.pi_optimizer.state_dict(),
                            'q1_optimizer': self.q1_optimizer.state_dict(),
                            'q2_optimizer': self.q2_optimizer.state_dict(),
                            'critic1_targ': self.critic1_targ.state_dict(),
                            'critic2_targ': self.critic2_targ.state_dict(),
                        }
                        self.logger.save_epoch_Ret_optimizer_model(
                            save_dict, self.epoch)

                    self.actor.eval()
                    # Test the performance of the deterministic version of the agent.
                    self.test_agent()

                    # Log info about epoch
                    self.logger.log_tabular('Epoch', self.epoch)
                    self.logger.log_tabular('EpRet', with_min_and_max=True)
                    self.logger.log_tabular('TestEpRet',
                                            with_min_and_max=False)
                    self.logger.log_tabular('EpLen', average_only=True)
                    self.logger.log_tabular('TestEpLen', average_only=True)
                    self.logger.log_tabular('TotalEnvInteracts', t)
                    self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                    self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                    self.logger.log_tabular('LogPi', with_min_and_max=True)
                    self.logger.log_tabular('LossPi', average_only=True)
                    self.logger.log_tabular('LossQ', average_only=True)
                    self.logger.log_tabular('Time', time.time() - start_time)
                    # if epoch > 1:
                    #     (time.time() - start_time)/epo
                    self.logger.dump_tabular()

            t += 1
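

# Minimal usage sketch (hypothetical; the environment id and output directory are
# placeholders, not part of the original file): train on a discrete-action Gym task.
if __name__ == '__main__':
    import gym

    agent = sac_discrete_class(lambda: gym.make('CartPole-v1'),
                               epochs=10,
                               logger_kwargs=dict(output_dir='./sac_discrete_out'))
    agent.run()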
Code Example #12
File: ddpg.py Project: MLRG-CEFET-RJ/DRL-ALM
def ddpg(env_fn=core.ALMEnv,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=300,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         update_after=1000,
         update_every=50,
         act_noise=.01,
         num_test_episodes=10,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         time_horizon=80,
         discount_rate=.06):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.
            In this version, the default environment is 'ALMEnv'

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # env, test_env = env_fn(), env_fn() original OpenAI SpinningUp entry
    env = env_fn(T=time_horizon, rate=discount_rate)  # Added by the author
    test_env = env_fn(T=time_horizon,
                      rate=discount_rate)  # Added by the author
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q = ac.q(o, a)

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        # Multiplicative exploration noise, then renormalization (added by the author)
        a = a * (noise_scale * np.random.randn(act_dim) + 1)
        return a / np.sum(a)
        # Original OpenAI Spinning Up version:
        # a += noise_scale * np.random.randn(act_dim)
        # return np.clip(a, -act_limit, act_limit)
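        # Note: dividing by np.sum(a) forces the action vector to sum to one,
        # presumably because ALMEnv actions are treated as allocation weights
        # (an assumption); the original clipping to [-act_limit, act_limit] is dropped.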

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
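

# Minimal usage sketch (hypothetical; the output directory is a placeholder):
# run the agent on the default core.ALMEnv environment with its default horizon
# and discount rate.
if __name__ == '__main__':
    ddpg(epochs=5,
         time_horizon=80,
         discount_rate=.06,
         logger_kwargs=dict(output_dir='./ddpg_alm_out'))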
Code Example #13
def td3(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space below.                     #
    #                                                                         #
    #=========================================================================#

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        #######################
        #                     #
        #   YOUR CODE HERE    #
        #                     #
        #######################
        # pi, q1, q2, q1_pi =
        pass

    # Target policy network
    with tf.variable_scope('target'):
        #######################
        #                     #
        #   YOUR CODE HERE    #
        #                     #
        #######################
        # pi_targ =
        pass

    # Target Q networks
    with tf.variable_scope('target', reuse=True):

        # Target policy smoothing, by adding clipped noise to target actions
        #######################
        #                     #
        #   YOUR CODE HERE    #
        #                     #
        #######################

        # Target Q-values, using action from smoothed target policy
        #######################
        #                     #
        #   YOUR CODE HERE    #
        #                     #
        #######################
        pass

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    #######################
    #                     #
    #   YOUR CODE HERE    #
    #                     #
    #######################

    # TD3 losses
    #######################
    #                     #
    #   YOUR CODE HERE    #
    #                     #
    #######################
    # pi_loss =
    # q1_loss =
    # q2_loss =
    # q_loss =

    #=========================================================================#
    #                                                                         #
    #           All of your code goes in the space above.                     #
    #                                                                         #
    #=========================================================================#
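
    # Reference sketch: one possible way to fill in the exercise blanks above,
    # following the standard Spinning Up TD3 recipe. It assumes ``actor_critic``
    # returns the tuple (pi, q1, q2, q1_pi) when called as
    # actor_critic(x, a, **ac_kwargs).
    with tf.variable_scope('main', reuse=tf.AUTO_REUSE):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    with tf.variable_scope('target', reuse=tf.AUTO_REUSE):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing: add clipped Gaussian noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = tf.clip_by_value(pi_targ + epsilon, -act_limit, act_limit)
        # Target Q-values, using the action from the smoothed target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Bellman backup with Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup) ** 2)
    q2_loss = tf.reduce_mean((q2 - backup) ** 2)
    q_loss = q1_loss + q2_loss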

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Code Example #14
def sac_adapt_fast(
        env_fn,
        hidden_sizes=[256, 256],
        seed=0,
        steps_per_epoch=1000,
        epochs=1000,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=3e-4,
        alpha=0.2,
        batch_size=256,
        start_steps=10000,
        max_ep_len=1000,
        save_freq=1,
        save_model=False,
        auto_alpha=True,
        grad_clip=-1,
        logger_store_freq=100,
        logger_kwargs=dict(),
):
    """
    Largely following OpenAI documentation, but a bit different
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        hidden_sizes: list of hidden layer sizes; the number of entries is the
            number of hidden layers and each entry is the width of that layer.
            Applies to all networks.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch. Here an epoch is just a
            logging epoch: every this many steps, statistics are written to stdout
            and to the output file. Not to be confused with a training epoch, a term
            used in the literature for many different things.

        epochs (int): Number of epochs to run and train agent. Usage of this term
            differs between algorithms, so use caution. Here a new set of logs is
            produced every epoch.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration. During testing, actions
            always come from the policy.

        max_ep_len (int): Maximum length of trajectory / episode / rollout. The
            environment is reset if the timestep within an episode exceeds this number.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_model (bool): set to True to save the trained agent.

        auto_alpha: set to True to use the adaptive alpha scheme; the target entropy
            is then set automatically.

        grad_clip: gradient-clipping norm; a value < 0 disables clipping.

        logger_store_freq: how often (in gradient updates) to store debugging info in
            the logger; typically does not need to be changed.

    """
    """set up logger"""
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env, test_env = env_fn(), env_fn()

    ## seed torch and numpy
    torch.manual_seed(seed)
    np.random.seed(seed)

    ## seed environment along with env action space so that everything about env is seeded
    env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.seed(seed + 10000)
    test_env.action_space.np_random.seed(seed + 10000)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # if environment has a smaller max episode length, then use the environment's max episode length
    max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    # we need .item() to convert it from numpy float to python float
    act_limit = env.action_space.high[0].item()

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)
    """
    Auto tuning alpha
    """
    if auto_alpha:
        target_entropy = -np.prod(env.action_space.shape).item()  # target entropy heuristic: -dim(A)
        log_alpha = torch.zeros(1, requires_grad=True)
        alpha_optim = optim.Adam([log_alpha], lr=lr)
    else:
        target_entropy, log_alpha, alpha_optim = None, None, None

    def test_agent(n=1):
        """
        This will test the agent's performance by running n episodes
        During the runs, the agent only take deterministic action, so the
        actions are not drawn from a distribution, but just use the mean
        :param n: number of episodes to run the agent
        """
        ep_return_list = np.zeros(n)
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a = policy_net.get_env_action(o, deterministic=True)
                o, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
            ep_return_list[j] = ep_ret
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    """init all networks"""
    # see line 1
    policy_net = TanhGaussianPolicySACAdapt(obs_dim,
                                            act_dim,
                                            hidden_sizes,
                                            action_limit=act_limit)
    q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)
    q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)

    q1_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)
    q2_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)

    # see line 2: copy parameters from value_net to target_value_net
    q1_target_net.load_state_dict(q1_net.state_dict())
    q2_target_net.load_state_dict(q2_net.state_dict())

    # set up optimizers
    policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr)
    q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr)

    # mean squared error loss for v and q networks
    mse_criterion = nn.MSELoss()

    # Main loop: collect experience in env and update/log each epoch
    # NOTE: t here is the current number of total timesteps used
    # it is not the number of timesteps passed in the current episode
    current_update_index = 0
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = policy_net.get_env_action(o, deterministic=False)
        else:
            a = env.action_space.sample()
        # Step the env, get next observation, reward and done signal
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience (observation, action, reward, next observation, done) to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        """perform update"""
        if replay_buffer.size >= batch_size:
            # get data from replay buffer
            batch = replay_buffer.sample_batch(batch_size)
            obs_tensor = Tensor(batch['obs1'])
            obs_next_tensor = Tensor(batch['obs2'])
            acts_tensor = Tensor(batch['acts'])
            # unsqueeze is to make sure rewards and done tensors are of the shape nx1, instead of n
            # to prevent problems later
            rews_tensor = Tensor(batch['rews']).unsqueeze(1)
            done_tensor = Tensor(batch['done']).unsqueeze(1)
            """
            now we do a SAC update, following the OpenAI spinup doc
            check the openai sac document psudocode part for reference
            line nubmers indicate lines in psudocode part
            we will first compute each of the losses
            and then update all the networks in the end
            """
            # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer)
            """get q loss"""
            with torch.no_grad():
                a_tilda_next, _, _, log_prob_a_tilda_next, _, _ = policy_net.forward(
                    obs_next_tensor)
                q1_next = q1_target_net(
                    torch.cat([obs_next_tensor, a_tilda_next], 1))
                q2_next = q2_target_net(
                    torch.cat([obs_next_tensor, a_tilda_next], 1))

                min_next_q = torch.min(q1_next,
                                       q2_next) - alpha * log_prob_a_tilda_next
                y_q = rews_tensor + gamma * (1 - done_tensor) * min_next_q

            # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
            q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1))
            q1_loss = mse_criterion(q1_prediction, y_q)
            q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1))
            q2_loss = mse_criterion(q2_prediction, y_q)
            """
            get policy loss
            """
            a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward(
                obs_tensor)

            # see line 12: second equation
            q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1))
            q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1))
            min_q1_q2_a_tilda = torch.min(q1_a_tilda, q2_a_tilda)

            # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
            policy_loss = (alpha * log_prob_a_tilda - min_q1_q2_a_tilda).mean()
            """
            alpha loss, update alpha
            """
            if auto_alpha:
                alpha_loss = -(
                    log_alpha *
                    (log_prob_a_tilda + target_entropy).detach()).mean()
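                # This temperature objective drives the policy entropy toward
                # target_entropy: the gradient w.r.t. log_alpha is
                # -(E[log pi] + target_entropy), so alpha grows when the entropy
                # is below target and shrinks when it is above.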

                alpha_optim.zero_grad()
                alpha_loss.backward()
                if grad_clip > 0:
                    nn.utils.clip_grad_norm_(log_alpha, grad_clip)
                alpha_optim.step()

                alpha = log_alpha.exp().item()
            else:
                # keep alpha_loss a tensor so the alpha_loss.item() call in the
                # logger below works even when auto_alpha is disabled
                alpha_loss = torch.zeros(1)
            """update networks"""
            q1_optimizer.zero_grad()
            q1_loss.backward()
            if grad_clip > 0:
                nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip)
            q1_optimizer.step()

            q2_optimizer.zero_grad()
            q2_loss.backward()
            if grad_clip > 0:
                nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip)
            q2_optimizer.step()

            policy_optimizer.zero_grad()
            policy_loss.backward()
            if grad_clip > 0:
                nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip)
            policy_optimizer.step()

            # see line 16: polyak-average the target Q-networks toward the online Q-networks
            soft_update_model1_with_model2(q1_target_net, q1_net, polyak)
            soft_update_model1_with_model2(q2_target_net, q2_net, polyak)
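            # (soft_update_model1_with_model2 is defined elsewhere; presumably it
            # applies the usual polyak rule to each parameter pair, roughly
            #     p1.data.copy_(polyak * p1.data + (1 - polyak) * p2.data)
            # so the targets track the online networks slowly.)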

            current_update_index += 1
            if current_update_index % logger_store_freq == 0:
                # store diagnostic info to logger
                logger.store(LossPi=policy_loss.item(),
                             LossQ1=q1_loss.item(),
                             LossQ2=q2_loss.item(),
                             LossAlpha=alpha_loss.item(),
                             Q1Vals=q1_prediction.detach().numpy(),
                             Q2Vals=q2_prediction.detach().numpy(),
                             Alpha=alpha,
                             LogPi=log_prob_a_tilda.detach().numpy())

        if d or (ep_len == max_ep_len):
            """when episode terminates, log info about this episode, then reset"""
            ## store episode return and length to logger
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            ## reset environment
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            """
            Save the PyTorch model; this is quite different from the TensorFlow version.
            We save the environment, the state_dict of each network,
            and the optimizers (here the optimizer objects themselves are pickled rather than their state_dicts).
            """
            if save_model:
                sac_state_dict = {
                    'env': env,
                    'policy_net': policy_net.state_dict(),
                    'q1_net': q1_net.state_dict(),
                    'q2_net': q2_net.state_dict(),
                    'q1_target_net': q1_target_net.state_dict(),
                    'q2_target_net': q2_target_net.state_dict(),
                    'policy_opt': policy_optimizer,
                    'q1_opt': q1_optimizer,
                    'q2_opt': q2_optimizer,
                    'log_alpha': log_alpha,
                    'alpha_opt': alpha_optim,
                    'target_entropy': target_entropy
                }
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    logger.save_state(sac_state_dict, None)
            # use joblib.load(fname) to load
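            # A minimal re-loading sketch (hypothetical; it assumes the logger
            # pickled the dict above with joblib to some file fname in its
            # output directory):
            #
            #     import joblib
            #     ckpt = joblib.load(fname)
            #     policy_net.load_state_dict(ckpt['policy_net'])
            #     q1_net.load_state_dict(ckpt['q1_net'])
            #     q2_net.load_state_dict(ckpt['q2_net'])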

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('Alpha', with_min_and_max=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            sys.stdout.flush()
Code Example #15
File: td3_nstep_sil.py Project: robintyh1/nstep-sil
def td3(env_fn,
        env_fn_test,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        logdir=None,
        nstep=None,
        alpha=None,
        beta=None,
        sil_weight=None):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.
        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================
        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.
        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.
        epochs (int): Number of epochs to run and train agent.
        replay_size (int): Maximum length of replay buffer.
        gamma (float): Discount factor. (Always between 0 and 1.)
        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:
            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)
        pi_lr (float): Learning rate for policy.
        q_lr (float): Learning rate for Q-networks.
        batch_size (int): Minibatch size for SGD.
        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.
        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)
        target_noise (float): Stddev for smoothing noise added to target 
            policy.
        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.
        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        logger_kwargs (dict): Keyword args for EpochLogger.
        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    assert logdir is not None
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    sess = tf.Session()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn_test()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)
    x_ph_sil, a_ph_sil, x2_ph_sil, r_ph_sil, d_ph_sil = core.placeholders(
        obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    with tf.variable_scope('main', reuse=True):
        _, q1_sil, q2_sil, _ = actor_critic(x_ph_sil, a_ph_sil, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    with tf.variable_scope('target', reuse=True):
        pi_targ_sil, _, _, _ = actor_critic(x2_ph_sil, a_ph_sil, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):

        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):

        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ_sil), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ_sil + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ_sil, q2_targ_sil, _ = actor_critic(x2_ph_sil, a2,
                                                      **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Prioritized replay for expert data
    sil_replay_buffer = prioritized_buffer.PrioritizedReplayBuffer(
        size=replay_size, alpha=alpha)
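    # (Assuming the usual prioritized-replay semantics: alpha controls how
    # strongly the stored priorities skew sampling, while beta, passed to
    # sample_batch below, is the importance-sampling exponent whose correction
    # weights are fed in through weights_ph.)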

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    backup_discount = gamma
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + backup_discount * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # sil q loss
    backup_discount_nstep = gamma**nstep
    min_q_targ_sil = tf.minimum(q1_targ_sil, q2_targ_sil)
    backup_sil = tf.stop_gradient(r_ph_sil + backup_discount_nstep *
                                  (1 - d_ph_sil) * min_q_targ_sil)
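    # The n-step target above presumably assumes r_ph_sil already holds the
    # discounted n-step return R^(n) = sum_{k=0}^{n-1} gamma^k * r_{t+k}
    # (as built by whatever wrapper emits info['nstep_data_n']), which is why
    # the bootstrapped term is scaled by gamma**nstep.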

    # TD3 losses
    weights_ph = tf.placeholder(tf.float32, [None])
    gains_1 = tf.nn.relu(backup_sil - q1_sil)
    gains_2 = tf.nn.relu(backup_sil - q2_sil)
    q1_loss_sil = tf.reduce_mean(weights_ph * tf.square(gains_1))
    q2_loss_sil = tf.reduce_mean(weights_ph * tf.square(gains_2))
    q_loss_sil = q1_loss_sil + q2_loss_sil
    gains = gains_1 + gains_2

    # add to the q loss
    q_loss += sil_weight * q_loss_sil
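    # In effect the SIL term only penalizes underestimation: gains_i =
    # max(backup_sil - q_i_sil, 0), so the critics are pushed up toward good
    # n-step returns but never pulled down by them, weighted by the
    # prioritized-replay importance weights.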

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        # test recorder
        ep_ret_list = []
        # set up
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0).flatten())
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            ep_ret_list.append(ep_ret)
        return ep_ret_list

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # record training
    ep_ret_record = []
    time_step_record = []

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise).flatten()
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, info = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d
        if 'nstep_data_1' in info.keys():
            info['nstep_data_1'][-1] = d
        if 'nstep_data_{}'.format(nstep) in info.keys():
            info['nstep_data_{}'.format(nstep)][-1] = d

        # Store experience to replay buffer
        if 'nstep_data_1' in info.keys():
            replay_buffer.store(*info['nstep_data_1'])
            if nstep == 1:
                try:
                    assert info['nstep_data_1'] == [o, a, r, o2, d]
                except:
                    import pdb
                    pdb.set_trace()
        if 'nstep_data_{}'.format(nstep) in info.keys():
            sil_replay_buffer.store(*info['nstep_data_{}'.format(nstep)])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                batch_sil, weights, batch_idxes = sil_replay_buffer.sample_batch(
                    batch_size, beta=beta)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    x_ph_sil: batch_sil['obs1'],
                    x2_ph_sil: batch_sil['obs2'],
                    a_ph_sil: batch_sil['acts'],
                    r_ph_sil: batch_sil['rews'],
                    d_ph_sil: batch_sil['done'],
                    weights_ph: weights
                }
                q_step_ops = [q_loss, q1, q2, train_q_op] + [gains]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # get the priorities
                new_priorities = outs[-1] + 1e-8
                sil_replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
                #print_stats('new priorities', new_priorities)

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            ep_rets = test_agent()
            ep_ret_record.append(np.mean(ep_rets))
            time_step_record.append(t)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # save the records
            np.save(logdir + '/ep_rets', ep_ret_record)
            np.save(logdir + '/timesteps', time_step_record)
Code Example #16
def oac(env_fn,
        actor_critic=mlp_actor_critic,
        logger_kwargs=dict(),
        network_params=dict(),
        rl_params=dict()):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # control params
    seed = rl_params['seed']
    epochs = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size = rl_params['replay_size']
    batch_size = rl_params['batch_size']
    start_steps = rl_params['start_steps']
    max_ep_len = rl_params['max_ep_len']
    save_freq = rl_params['save_freq']
    render = rl_params['render']

    # rl params
    gamma = rl_params['gamma']
    polyak = rl_params['polyak']
    lr = rl_params['lr']
    grad_clip_val = rl_params['grad_clip_val']

    # entropy params
    alpha = rl_params['alpha']
    target_entropy = rl_params['target_entropy']

    # optimistic exploration params
    use_opt = rl_params['use_opt']
    beta_UB = rl_params['beta_UB']
    beta_LB = rl_params['beta_LB']
    delta = rl_params['delta']

    train_env, test_env = env_fn(), env_fn()
    obs_dim = train_env.observation_space.shape[0]
    act_dim = train_env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = train_env.action_space.high[0]

    # set the seed
    tf.set_random_seed(seed)
    np.random.seed(seed)
    train_env.seed(seed)
    train_env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    # Share information about action space with policy architecture
    network_params['action_space'] = train_env.action_space

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim,
                                                 None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1_a, q2_a, pretanh_mu, std = actor_critic(
            x_ph, a_ph, **network_params)

    with tf.variable_scope('main', reuse=True):

        # compose q with mu
        _, _, _, q1_mu, q2_mu, _, _ = actor_critic(x_ph, mu, **network_params)

        # compose q with pi, for pi-learning
        _, _, _, q1_pi, q2_pi, _, _ = actor_critic(x_ph, pi, **network_params)

        # get actions and log probs of actions for next states, for Q-learning
        _, pi_next, logp_pi_next, _, _, _, _ = actor_critic(
            x2_ph, a_ph, **network_params)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, q1_pi_targ, q2_pi_targ, _, _ = actor_critic(
            x2_ph, pi_next, **network_params)

    # alpha Params
    if target_entropy == 'auto':
        target_entropy = tf.cast(-act_dim, tf.float32)
    else:
        target_entropy = tf.cast(target_entropy, tf.float32)

    log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)

    if alpha == 'auto':  # auto tune alpha
        alpha = tf.exp(log_alpha)
    else:  # fixed alpha
        alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha)

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main'])
    print("""\nNumber of parameters:
             alpha: %d,
             pi: %d,
             q1: %d,
             q2: %d,
             total: %d\n""" % var_counts)

    if use_opt:
        # Optimistic Exploration
        mu_Q = (q1_mu + q2_mu) / 2.0
        sigma_Q = tf.math.abs(q1_mu - q2_mu) / 2.0

        Q_UB = mu_Q + beta_UB * sigma_Q
        Q_LB = mu_Q + beta_LB * sigma_Q

        grad_Q_UB = tf.gradients(Q_UB, pretanh_mu)[0]
        Sigma = tf.math.pow(std, 2)

        denom = tf.math.sqrt(
            tf.math.reduce_sum(
                tf.math.multiply(tf.math.pow(grad_Q_UB, 2), Sigma))) + 10e-6

        mu_C = np.sqrt(2.0 * delta) * tf.math.multiply(Sigma,
                                                       grad_Q_UB) / denom
        mu_E = pretanh_mu + mu_C

        optimistic_pi = tf.tanh(mu_E + tf.random_normal(tf.shape(mu_E)) * std)
        optimistic_pi *= act_limit
    else:
        optimistic_pi = pi  # use standard SAC policy
        Q_LB = tf.minimum(q1_pi, q2_pi)
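    # The optimistic branch above implements the OAC exploration rule: with the
    # twin critics giving mu_Q = (Q1 + Q2)/2 and sigma_Q = |Q1 - Q2|/2, the
    # pre-tanh mean is shifted along the Sigma-scaled gradient of the upper
    # bound, mu_E = mu + sqrt(2*delta) * Sigma * grad(Q_UB) / ||grad(Q_UB)||_Sigma,
    # and the shifted Gaussian is then sampled and squashed as usual.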

    # Min Double-Q:
    min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) *
                                (min_q_pi_targ - alpha * logp_pi_next))

    # critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
    value_loss = q1_loss + q2_loss

    # Soft actor losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - Q_LB)

    # alpha loss for temperature parameter
    alpha_backup = tf.stop_gradient(logp_pi + target_entropy)
    alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup)
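    # This is the standard SAC temperature objective written in terms of
    # log_alpha: minimizing it raises alpha when the policy entropy falls
    # below target_entropy and lowers it otherwise.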

    # Policy train op
    # (has to be separate from value train op, because the critic output used in pi_loss (Q_LB) depends on the Q-networks)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    if grad_clip_val is not None:
        gvs = pi_optimizer.compute_gradients(pi_loss,
                                             var_list=get_vars('main/pi'))
        capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var)
                      for grad, var in gvs]
        train_pi_op = pi_optimizer.apply_gradients(capped_gvs)
    else:
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_pi_op]):
        if grad_clip_val is not None:
            gvs = value_optimizer.compute_gradients(
                value_loss, var_list=get_vars('main/q'))
            capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var)
                          for grad, var in gvs]
            train_value_op = value_optimizer.apply_gradients(capped_gvs)
        else:
            train_value_op = value_optimizer.minimize(
                value_loss, var_list=get_vars('main/q'))

    alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_value_op]):
        train_alpha_op = alpha_optimizer.minimize(
            alpha_loss, var_list=get_vars('log_alpha'))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy,
        alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op,
        target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x_ph': x_ph,
                              'a_ph': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1_a': q1_a,
                              'q2_a': q2_a
                          })

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else optimistic_pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=10, render=True):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0

            if render: test_env.render()

            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1

                if render: test_env.render()

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

        if render: test_env.close()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = train_env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = train_env.action_space.sample()

        # Step the env
        o2, r, d, _ = train_env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }

                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3],
                             Q2Vals=outs[4],
                             LogPi=outs[5],
                             TargEntropy=outs[6],
                             LossAlpha=outs[7],
                             Alpha=outs[8])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = train_env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': train_env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(n=4, render=render)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', average_only=True)
            logger.log_tabular('TargEntropy', average_only=True)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'),
                  show_plot=False)
Code Example #17
def shpo(env_fn,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         device='cpu',
         steps_per_epoch=4000,
         epochs=50,
         replay_size=1000000,
         gamma=0.99,
         polyak=0.005,
         polyak_pi=0.0,
         lr=1e-3,
         batch_size=100,
         expand_batch=100,
         start_steps=10000,
         update_after=10000,
         num_test_episodes=10,
         per_update_steps_for_actor=100,
         per_update_steps_for_critic=50,
         cg_iters=10,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         algo='shpo'):
    """
    Sinkhorn Policy Optimization (SHPO)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to SHPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        polyak_pi (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        batch_size (int): Minibatch size for Critic.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # ====== All About Init ===============================================================
    device = torch.device(device)

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env, test_env = env_fn(), env_fn()

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    env.seed(seed)
    test_env.seed(seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print("obs_dim = {}, act_dim = {}".format(obs_dim, act_dim))

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())
    q_optimizer = Adam(q_params, lr=lr)
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)

    # Experience buffer
    replay_buffer = core.ReplayBuffer(obs_dim=obs_dim,
                                      act_dim=act_dim,
                                      size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # ===== End Of Init =========================================================================

    # ===== Critic Loss =========================================================================
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        o = torch.FloatTensor(o).to(device)
        a = torch.FloatTensor(a).to(device)
        r = torch.FloatTensor(r).to(device)
        o2 = torch.FloatTensor(o2).to(device)
        d = torch.FloatTensor(d).to(device)

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions

        with torch.no_grad():
            # Target actions come from the *target* policy
            a2 = ac_targ.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    def update_critic(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

            for p, p_targ in zip(ac.pi.parameters(), ac_targ.pi.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                # Note the (1 - polyak_pi) factor: multiplying by polyak_pi here,
                # as the original code did, would zero the target policy when polyak_pi = 0.
                p_targ.data.mul_(polyak_pi)
                p_targ.data.add_((1 - polyak_pi) * p.data)

    # ===== End Of Critic Loss ============================================================================

    # ===== Update Actor ==================================================================================
    def compute_loss_pi(data):
        o = data['obs']
        o = torch.FloatTensor(o).to(device)

        o2 = o.repeat(expand_batch, 1)
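        # Each observation is tiled expand_batch times so the stochastic policy
        # is sampled expand_batch times per state; -q_pi.mean() below is then a
        # Monte-Carlo estimate of -E_{a~pi}[min(Q1, Q2)].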
        a2 = ac.pi(o2)
        q1_pi = ac.q1(o2, a2)
        q2_pi = ac.q2(o2, a2)
        q_pi = torch.min(q1_pi, q2_pi)

        # Policy loss: maximize the clipped double-Q estimate (no entropy term in this variant)
        loss_pi = -q_pi.mean()
        return loss_pi

    def update_actor(data):
        for p in q_params:
            p.requires_grad = False

        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        logger.store(LossPi=loss_pi.item())
        """
        # ??? I am not sure: Do I need zero_grad()?
        loss_pi = compute_loss_pi(data)
        grads = torch.autograd.grad(loss_pi, ac.pi.parameters())
        grads_vector = torch.cat([grad.view(-1) for grad in grads]).data

        def get_Hx(x):
            # Require New Method.

        invHg = core.cg(get_Hx, loss_grad, cg_iters)
        # fullstep = ???

        with torch.no_grad():
            prev_params = core.get_flat_params_from(ac.pi)
            # new_params = ???
            # core.set_flat_params_to(ac.pi, new_params)
        """

        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item())

    # ===== End Of Actor ==================================================================================

    # ===== Start Training ================================================================================
    def get_action(o, deterministic=False):
        # o = replay_buffer.obs_encoder(o)
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        a = ac.act(o, deterministic)
        return a

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t <= start_steps:
            a = env.action_space.sample()
        else:
            a = get_action(o, deterministic=False)

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)

        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        ac.obs_std = torch.FloatTensor(replay_buffer.obs_std).to(device)
        ac.obs_mean = torch.FloatTensor(replay_buffer.obs_mean).to(device)
        ac_targ.obs_std = ac.obs_std
        ac_targ.obs_mean = ac.obs_mean

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and (t + 1) % steps_per_epoch == 0:
            for j in range(per_update_steps_for_critic):
                data = replay_buffer.sample_batch(batch_size)
                update_critic(data)

            for j in range(per_update_steps_for_actor):
                data = replay_buffer.sample_recently(steps_per_epoch)
                update_actor(data)

        # End of epoch handling (note: this block sits inside the update branch,
        # so it only runs once t >= update_after)
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Report the running observation-normalization statistics
            print("obs_mean=" + str(replay_buffer.obs_mean))
            print("obs_std=" + str(replay_buffer.obs_std))
Code Example #18
def ddpg_dropout(env_fn,
                 ac_kwargs=dict(),
                 seed=0,
                 new_mlp=True,
                 dropout_rate=0,
                 steps_per_epoch=5000,
                 epochs=100,
                 replay_size=int(1e6),
                 gamma=0.99,
                 polyak=0.995,
                 pi_lr=1e-3,
                 q_lr=1e-3,
                 batch_size=100,
                 start_steps=10000,
                 act_noise=0.1,
                 max_ep_len=1000,
                 logger_kwargs=dict(),
                 save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    hidden_sizes = list(ac_kwargs['hidden_sizes'])
    actor_hidden_activation = tf.keras.activations.relu
    actor_output_activation = tf.keras.activations.tanh
    critic_hidden_activation = tf.keras.activations.relu
    critic_output_activation = tf.keras.activations.linear

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        if not new_mlp:
            actor = BeroulliDropoutMLP(
                layer_sizes=hidden_sizes + [act_dim],
                dropout_rate=dropout_rate,
                hidden_activation=actor_hidden_activation,
                output_activation=actor_output_activation)
            critic = BeroulliDropoutMLP(
                layer_sizes=hidden_sizes + [1],
                dropout_rate=dropout_rate,
                hidden_activation=critic_hidden_activation,
                output_activation=critic_output_activation)
        else:
            actor = MLP(layer_sizes=hidden_sizes + [act_dim],
                        dropout_rate=dropout_rate,
                        hidden_activation=actor_hidden_activation,
                        output_activation=actor_output_activation)
            critic = MLP(layer_sizes=hidden_sizes + [1],
                         dropout_rate=dropout_rate,
                         hidden_activation=critic_hidden_activation,
                         output_activation=critic_output_activation)
        # Set training=False to ignore dropout masks
        pi = act_limit * actor(x_ph, training=False)
        q = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1),
                              training=False),
                       axis=1)
        q_pi = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1),
                                 training=False),
                          axis=1)
        # Set training=True to apply the dropout masks to each hidden layer and the output layer
        pi_drop = act_limit * actor(x_ph, training=True)
        q_drop = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1),
                                   training=True),
                            axis=1)
        # 1.
        q_pi_drop = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1),
                                      training=True),
                               axis=1)
        # 2.
        q_pi_drop = tf.squeeze(critic(tf.concat([x_ph, pi_drop], axis=-1),
                                      training=True),
                               axis=1)
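        # NOTE: the second assignment overwrites the first, so q_pi_drop ends up
        # being variant 2 (dropout-sampled policy action fed through the
        # dropout-sampled critic); it is referenced only in the commented-out
        # loss variants below.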

        # q_drop = tf.squeeze(critic(tf.concat([x_ph, a_ph], axis=-1), training=False), axis=1)
        # q_pi_drop = tf.squeeze(critic(tf.concat([x_ph, pi], axis=-1), training=False), axis=1)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        if not new_mlp:
            actor_targ = BeroulliDropoutMLP(
                layer_sizes=hidden_sizes + [act_dim],
                dropout_rate=dropout_rate,
                hidden_activation=actor_hidden_activation,
                output_activation=actor_output_activation)
            critic_targ = BeroulliDropoutMLP(
                layer_sizes=hidden_sizes + [1],
                dropout_rate=dropout_rate,
                hidden_activation=critic_hidden_activation,
                output_activation=critic_output_activation)
        else:
            actor_targ = MLP(layer_sizes=hidden_sizes + [act_dim],
                             dropout_rate=dropout_rate,
                             hidden_activation=actor_hidden_activation,
                             output_activation=actor_output_activation)
            critic_targ = MLP(layer_sizes=hidden_sizes + [1],
                              dropout_rate=dropout_rate,
                              hidden_activation=critic_hidden_activation,
                              output_activation=critic_output_activation)

        # Set training=False to ignore dropout for backup target value
        pi_targ = act_limit * actor_targ(x2_ph, training=False)
        q_pi_targ = tf.squeeze(critic_targ(tf.concat([x2_ph, pi_targ],
                                                     axis=-1),
                                           training=False),
                               axis=1)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    # # 1.
    # pi_loss = -tf.reduce_mean(q_pi)
    # q_loss = tf.reduce_mean((q-backup)**2)
    # # 2.
    # pi_loss = -tf.reduce_mean(q_pi_drop)
    # q_loss = tf.reduce_mean((q_drop - backup) ** 2)
    # 3.
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q_drop - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=actor.variables)
    train_q_op = q_optimizer.minimize(q_loss, var_list=critic.variables)

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(actor.variables +
                                  critic.variables, actor_targ.variables +
                                  critic_targ.variables)
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(actor.variables +
                                  critic.variables, actor_targ.variables +
                                  critic_targ.variables)
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q': q})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
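        # Perturb the next observation with small Gaussian noise before it is stored.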
        state_noise_scale = 0.01
        o2 += state_noise_scale * np.random.randn(obs_dim)
        # import pdb; pdb.set_trace()
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
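
The target-network update and the Bellman backup in the example above follow the standard DDPG recipe. Below is a minimal NumPy-only sketch of those two rules; polyak_update, bellman_backup, and the toy shapes/values are illustrative assumptions, not taken from the code above.

import numpy as np

def polyak_update(theta_targ, theta_main, polyak=0.995):
    """theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main."""
    return [polyak * vt + (1.0 - polyak) * vm
            for vt, vm in zip(theta_targ, theta_main)]

def bellman_backup(r, d, q_pi_targ, gamma=0.99):
    """Regression target for Q: r + gamma * (1 - d) * Q_targ(s', pi_targ(s'))."""
    return r + gamma * (1.0 - d) * q_pi_targ

# Toy usage with made-up parameter lists and a single transition.
main_vars = [np.ones((4, 2)), np.zeros(2)]
targ_vars = [np.zeros((4, 2)), np.ones(2)]
targ_vars = polyak_update(targ_vars, main_vars)
print(bellman_backup(r=1.0, d=0.0, q_pi_targ=5.0))  # 1.0 + 0.99 * 5.0 = 5.95
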
コード例 #19
0
def ppo(workload_file,
        model_path,
        ac_kwargs=dict(),
        seed=0,
        traj_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        pre_trained=0,
        trained_model=None,
        attn=False,
        shuffle=False,
        backfil=False,
        skip=False,
        score_type=0,
        batch_job_slice=0,
        sched_algo=4):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = HPCEnvSkip(shuffle=shuffle,
                     backfil=backfil,
                     skip=skip,
                     job_score_type=score_type,
                     batch_job_slice=batch_job_slice,
                     build_sjf=False,
                     sched_algo=sched_algo)
    env.seed(seed)
    env.my_init(workload_file=workload_file, sched_file=model_path)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['attn'] = attn

    # Inputs to computation graph

    buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE,
                    gamma, lam)

    if pre_trained:
        sess = tf.Session()
        model = restore_tf_graph(sess, trained_model)
        logger.log('load pre-trained model')
        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                   var_counts)

        x_ph = model['x']
        a_ph = model['a']
        mask_ph = model['mask']
        adv_ph = model['adv']
        ret_ph = model['ret']
        logp_old_ph = model['logp_old_ph']

        pi = model['pi']
        v = model['v']
        # logits = model['logits']
        out = model['out']
        logp = model['logp']
        logp_pi = model['logp_pi']
        pi_loss = model['pi_loss']
        v_loss = model['v_loss']
        approx_ent = model['approx_ent']
        approx_kl = model['approx_kl']
        clipfrac = model['clipfrac']
        clipped = model['clipped']

        # Optimizers
        # graph = tf.get_default_graph()
        # op = sess.graph.get_operations()
        # [print(m.values()) for m in op]
        # train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0')
        # train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0')
        train_pi = tf.get_collection("train_pi")[0]
        train_v = tf.get_collection("train_v")[0]
        # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad')
        # train_pi = train_pi_optimizer.minimize(pi_loss)
        # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad')
        # train_v = train_v_optimizer.minimize(v_loss)
        # sess.run(tf.variables_initializer(train_pi_optimizer.variables()))
        # sess.run(tf.variables_initializer(train_v_optimizer.variables()))
        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]
        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

    else:
        x_ph, a_ph = placeholders_from_spaces(env.observation_space,
                                              env.action_space)
        # y_ph = placeholder(JOB_SEQUENCE_SIZE*3) # 3 is the number of sequence features
        mask_ph = placeholder(env.action_space.n)
        adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None)

        # Main outputs from computation graph
        pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph,
                                                 **ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]

        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

        # Experience buffer

        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                   var_counts)

        # PPO objectives
        ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                           (1 - clip_ratio) * adv_ph)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
        v_loss = tf.reduce_mean((ret_ph - v)**2)

        # Info (useful to watch during learning)
        approx_kl = tf.reduce_mean(
            logp_old_ph -
            logp)  # a sample estimate for KL-divergence, easy to compute
        approx_ent = tf.reduce_mean(
            -logp)  # a sample estimate for entropy, also easy to compute
        clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio <
                                (1 - clip_ratio))
        clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

        # Optimizers
        train_pi = tf.train.AdamOptimizer(
            learning_rate=pi_lr).minimize(pi_loss)
        train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        tf.add_to_collection("train_pi", train_pi)
        tf.add_to_collection("train_v", train_v)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v})
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph,
                              'adv': adv_ph,
                              'mask': mask_ph,
                              'ret': ret_ph,
                              'logp_old_ph': logp_old_ph
                          },
                          outputs={
                              'pi': pi,
                              'v': v,
                              'out': out,
                              'pi_loss': pi_loss,
                              'logp': logp,
                              'logp_pi': logp_pi,
                              'v_loss': v_loss,
                              'approx_ent': approx_ent,
                              'approx_kl': approx_kl,
                              'clipped': clipped,
                              'clipfrac': clipfrac
                          })

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = \
        env.reset(), 0, False, 0, 0, 0, 0, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    start_time = time.time()
    for epoch in range(epochs):
        t = 0
        discard_times = 0
        while True:
            # [no_skip, skip]
            lst = [1, 1]
            #for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES):
            #    job = o[i:i + JOB_FEATURES]
            #    # the skip time of will_skip job exceeds MAX_SKIP_TIME
            #    if job[-3] == 1.0:
            #        lst = [1,0]

            a, v_t, logp_t, output = sess.run(get_action_ops,
                                              feed_dict={
                                                  x_ph:
                                                  o.reshape(1, -1),
                                                  mask_ph:
                                                  np.array(lst).reshape(1, -1)
                                              })
            # print(a, end=" ")
            '''
            action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs)
            log_action_prob = np.log(action_probs[action])
            '''

            # save and log
            if buf.ptr - buf.path_start_idx >= 10 * JOB_SEQUENCE_SIZE:
                discard_times += 1
                buf.ptr = buf.path_start_idx
                [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = \
                    env.reset(), 0, False, 0, 0, 0, 0, 0, 0
                continue
            buf.store(o, None, a, np.array(lst), r, v_t, logp_t)
            logger.store(VVals=v_t)
            if a[0] == 1:
                skip_count += 1
            o, r, d, r2, sjf_t, f1_t = env.step(a[0])
            ep_ret += r
            ep_len += 1
            show_ret += r2
            sjf += sjf_t
            f1 += f1_t

            if d:
                t += 1
                buf.finish_path(r)
                logger.store(EpRet=ep_ret,
                             EpLen=ep_len,
                             ShowRet=show_ret,
                             SJF=sjf,
                             F1=f1,
                             SkipRatio=skip_count / ep_len)
                [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = \
                    env.reset(), 0, False, 0, 0, 0, 0, 0, 0
                if t >= traj_per_epoch:
                    # print ("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action)
                    break
        # print("Sample time:", (time.time()-start_time)/num_total, num_total)
        # Save model
        print(discard_times)
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # start_time = time.time()
        update()
        # print("Train time:", time.time()-start_time)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts',
                           (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('ShowRet', average_only=True)
        logger.log_tabular('SJF', average_only=True)
        logger.log_tabular('F1', average_only=True)
        logger.log_tabular('SkipRatio', with_min_and_max=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
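
The pi_loss above is the standard PPO clipped surrogate, written with tf.where. A small NumPy sketch of an equivalent clip-based form, plus the same sample KL estimator the training loop checks; all sample values are made up for illustration.

import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    """Negative clipped surrogate: -mean(min(ratio * adv, clip(ratio) * adv))."""
    ratio = np.exp(logp - logp_old)
    clipped = np.clip(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    return -np.mean(np.minimum(ratio * adv, clipped))

# Toy batch of three samples.
logp_old = np.array([-1.0, -2.0, -0.5])
logp = np.array([-0.8, -2.3, -0.5])
adv = np.array([1.0, -0.5, 2.0])
print(ppo_clip_loss(logp, logp_old, adv))
print(np.mean(logp_old - logp))  # same KL estimator the loop compares to 1.5 * target_kl
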
コード例 #20
0
def sac1_carla(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=3000, epochs=100, replay_size=int(3e5), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=9000,
        max_ep_len=600, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float or 'auto'): Entropy regularization coefficient. (Equivalent
            to the inverse of the reward scale in the original SAC paper.) If set
            to 'auto', alpha is tuned automatically during training.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    obs_space = env.observation_space.spaces[0]
    act_space = env.action_space
    obs_dim = obs_space.shape
    act_dim = act_space.shape

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space


    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(False, x_ph, a_ph, **ac_kwargs)
    
    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(False, x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=list(obs_dim), act_dim=list(act_dim), size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in 
                       ['main/cnn_layer', 'main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t cnn_layer: %d, \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)

######
    if alpha == 'auto':
        target_entropy = (-np.prod(env.action_space.shape))
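        # Standard SAC heuristic: target entropy = -|A|, the negative of the action dimensionality.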

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)
    q_backup = r_ph + gamma*(1-d_ph)*v_backup


    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - tf.stop_gradient(q1_pi))
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    cnn_params = get_vars('main/cnn_layer')
    # Policy train op 
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    pi_params = get_vars('main/pi')
    with tf.control_dependencies(update_ops):
        train_pi_op = pi_optimizer.minimize(pi_loss, var_list = cnn_params + pi_params)

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')

    with tf.control_dependencies(update_ops):
        with tf.control_dependencies([train_pi_op]):
            train_value_op = value_optimizer.minimize(value_loss, var_list=cnn_params + value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies(update_ops):
        with tf.control_dependencies([train_value_op]):
            target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    with tf.control_dependencies(update_ops):
        if isinstance(alpha, Number):
            step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
        else:
            step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]


    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 
                                outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o[np.newaxis,...]})[0]

    def test_agent(n=1):
        # global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            if np.random.randn() > 0.1:
                b = (1 + np.random.random(1)) * 0.5
            else:
                b = -1 + 2 * np.random.random(1)
            #b = np.array([1])
            #c = np.array([0])
            c = -1 + 2*np.random.random(1)
            a = np.stack((b, c))
            a = a.flatten()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                            }

                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                            Q1Vals=outs[3], Q2Vals=outs[4],
                            LogPi=outs[5], Alpha=outs[6])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.

            # test_agent()
            # logger.log_tabular('TestEpLen', average_only=True)
            # logger.log_tabular('TestEpRet', with_min_and_max=True)

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True) 
            logger.log_tabular('Q2Vals', with_min_and_max=True) 
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
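
With alpha == 'auto' above, the temperature is trained by minimizing mean(-log_alpha * stop_gradient(logp_pi + target_entropy)). A hedged NumPy sketch of a single gradient step on log_alpha; the helper name, sample values, and learning rate are illustrative assumptions.

import numpy as np

def alpha_grad_step(log_alpha, logp_pi, target_entropy, lr=1e-3):
    """One gradient-descent step on mean(-log_alpha * (logp_pi + target_entropy))."""
    grad = -np.mean(logp_pi + target_entropy)  # d(loss)/d(log_alpha)
    return log_alpha - lr * grad

target_entropy = -2.0                 # -act_dim for a 2-D action space
log_alpha = 0.0                       # alpha starts at exp(0) = 1
logp_pi = np.array([3.0, 2.5, 3.5])   # policy entropy ~ -3, below the -2 target
log_alpha = alpha_grad_step(log_alpha, logp_pi, target_entropy)
print(np.exp(log_alpha))              # slightly above 1: alpha rises to push entropy back up
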
コード例 #21
0
ファイル: vpg.py プロジェクト: firefly34/implementations
def vpg(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3,
        train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10):
    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    # setup_pytorch_for_mpi()

    # Setup logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random Seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate Environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create Actor-Critic Module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params()

    # Count the number of variables
    var_counts = tuple(core.count_variables(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experiment buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for calculating VPG policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp']

        # Policy Loss
        pi, logp = ac.pi(obs, act)
        loss_pi = -(logp * adv).mean()

        # Useful extra information
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, ent=ent)

        return loss_pi, pi_info

    # Set up a function for calculating Value Function loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret) ** 2).mean()

    # Set up optimizers for policy and value functions

    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_save(ac)

    def update():
        data = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with a single step of gradient descent
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        mpi_avg_grads(ac.pi)  # average grads across MPI processes
        pi_optimizer.step()

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)
            vf_optimizer.step()

        # Log changes from the update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    # Prepare for interaction with the environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main Loop: Collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical !)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True)
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform vpg update
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
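
The advantages consumed by compute_loss_pi above come from VPGBuffer, which, given the gamma and lam arguments, is presumably a GAE-Lambda advantage buffer as in Spinning Up. A minimal NumPy sketch of that estimator for one finished trajectory; the function name and toy numbers are illustrative assumptions.

import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    """GAE-Lambda: A_t = sum_l (gamma * lam)^l * delta_{t+l},
    with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
    `vals` holds one extra entry: the bootstrap value for the final state."""
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(rews)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

# Toy 4-step trajectory that ended (bootstrap value 0 appended).
rews = np.array([1.0, 0.0, 0.0, 1.0])
vals = np.array([0.5, 0.4, 0.3, 0.2, 0.0])
print(gae_advantages(rews, vals))
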
コード例 #22
0
ファイル: sac_rnn.py プロジェクト: zengsh-cqupt/DRL
def sac1(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, lr=1e-4, alpha=0.2, batch_size=150, start_steps=10000,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float or 'auto'): Entropy regularization coefficient. (Equivalent
            to the inverse of the reward scale in the original SAC paper.) If set
            to 'auto', alpha is tuned automatically during training.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['obs_dim'] = obs_dim
    h_size = ac_kwargs["h_size"]  # hidden size of rnn
    seq_length = ac_kwargs["seq"]  # seq length of rnn

    # Inputs to computation graph
    seq = None  # training and testing don't have to use the same sequence length
    x_ph, a_ph, r_ph, d_ph = core.placeholders([seq, obs_dim], [seq, act_dim], [seq, 1], [seq, 1])
    s_t_0 = tf.placeholder(shape=[None, h_size], name="pre_state", dtype="float32")  # zero state
    # s_0 = np.zeros([batch_size, h_size])  # zero state for training  N H

    # Main outputs from computation graph
    # outputs, states = cudnn_rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"])
    outputs, states = rnn_cell(x_ph, s_t_0, h_size=ac_kwargs["h_size"])
    # states = outputs[:, -1, :]
    # outputs = mlp(outputs, [ac_kwargs["h_size"], ac_kwargs["h_size"]], activation=tf.nn.elu)

    # If a dynamics model is used, predict the next state (obs) representation
    with tf.variable_scope("model"):
        """hidden size for mlp
           h_size for RNN
        """
        s_predict = mlp(tf.concat([outputs, a_ph], axis=-1),
                        list(ac_kwargs["hidden_sizes"]) + [ac_kwargs["h_size"]], activation=tf.nn.relu)
        # s_predict = mlp(tf.concat([outputs, a_ph], axis=-1),
        #                 list(ac_kwargs["hidden_sizes"]) + [ac_kwargs["obs_dim"] - act_dim], activation=tf.nn.elu)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, s_t_0, outputs, states,
                                                             **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, q1_pi_, q2_pi_ = actor_critic(x_ph, a_ph, s_t_0, outputs, states,
                                                     **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 h_size=h_size,
                                 seq_length=seq_length,
                                 flag="seq",
                                 normalize=ac_kwargs["norm"])

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', "model"])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t model: %d \n' % var_counts)

    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.shape))
        target_entropy = -np.prod(env.action_space.shape)

        # log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        # print(ac_kwargs["h0"])
        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=ac_kwargs["h0"])
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi[:, :-1, :] + target_entropy))
        # Use smaller learning rate to make alpha decay slower
        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=1e-5, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])

    # model train op
    # we can't use s_T to predict s_T+1
    # delta_x = tf.stop_gradient(x_ph[:, 1:, :] - x_ph[:, :-1, :])  # predict delta obs instead of obs
    # TODO: can we use L1 loss
    delta_x = tf.stop_gradient(outputs[:, 1:, :] - outputs[:, :-1, :])  # predict delta obs instead of obs
    model_loss = tf.abs((1 - d_ph[:, :-1, :]) * (s_predict[:, :-1, :] - delta_x))  # how about "done" state
    model_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # print(tf.global_variables())
    if "m" in ac_kwargs["opt"]:
        value_params_1 = get_vars('model') + get_vars('rnn')
    else:
        value_params_1 = get_vars('model')
    # opt for optimize model
    train_model_op = model_optimizer.minimize(tf.reduce_mean(model_loss), var_list=value_params_1)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(tf.minimum(q1_pi_, q2_pi_) - alpha * logp_pi)
    # clip curiosity
    in_r = tf.stop_gradient(tf.reduce_mean(tf.clip_by_value(model_loss, 0, 64), axis=-1, keepdims=True))
    beta = tf.placeholder(dtype=tf.float32, shape=(), name="beta")
    # beta = ac_kwargs["beta"]  # adjust internal reward
    # can we prove the optimal value of beta
    # I think beta should decrease with training going on
    # beta = alpha  # adjust internal reward
    q_backup = r_ph[:, :-1, :] + beta * in_r + gamma * (1 - d_ph[:, :-1, :]) * v_backup[:, 1:, :]

    # Soft actor-critic losses
    # pi_loss = tf.reduce_mean(alpha * logp_pi[:, :-1, :] - q1_pi[:, :-1, :])
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    # In some cases the Q value at the last timestep matters most, so a weighted sum of per-timestep losses might help.
    # The last timestep is left out below for convenience (there is no next state to bootstrap from).
    q1_loss = 0.5 * tf.reduce_mean((q1[:, :-1, :] - q_backup) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q2[:, :-1, :] - q_backup) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # train model first
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    with tf.control_dependencies([train_model_op]):
        train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    # TODO: maybe we should add parameters in main/rnn to optimizer ---> training is super slow while we adding it
    # TODO: if use model maybe we shouldn't opt rnn with q???
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    if "q" in ac_kwargs["opt"]:
        value_params = get_vars('main/q') + get_vars('rnn')
    else:
        value_params = get_vars('main/q')

    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in non_deterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha), model_loss, train_model_op,
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha, model_loss, train_model_op,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, s_t_0_, mu, pi, states, deterministic=False):
        """s_t_0_  starting step for testing 1 H"""

        act_op = mu if deterministic else pi
        action, s_t_1_ = sess.run([act_op, states], feed_dict={x_ph: o.reshape(1, 1, obs_dim),
                                                               a_ph: np.zeros([1, 1, act_dim]),
                                                               s_t_0: s_t_0_})
        return action.reshape(act_dim), s_t_1_

    def test_agent(mu, pi, states, n=5):
        # global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            s_0 = np.zeros([1, h_size])
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a, s_1 = get_action(o, s_0, mu, pi, states, deterministic=True)
                s_0 = s_1
                o, r, d, _ = test_env.step(a)
                # test_env.render()
                ep_ret += r
                ep_len += 1
                # replay_buffer.store(o.reshape([1, obs_dim]), a.reshape([1, act_dim]), r, d)
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    # start = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    s_t_0_ = np.zeros([1, h_size])
    episode = 0

    for t in range(total_steps + 1):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t == 0:
            start = time.time()

        if t > start_steps:
            # s_t_0_store = s_t_0_    # hidden state stored in buffer
            a, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False)
            s_t_0_ = s_t_1_
        else:
            # s_t_0_store = s_t_0_
            # print(s_t_0_.shape)
            _, s_t_1_ = get_action(o, s_t_0_, mu, pi, states, deterministic=False)
            s_t_0_ = s_t_1_
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)  # returns o_{t+1}; we store o_t because that is the observation that produced a_t
        # print(r)
        # env.render()
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o.reshape([1, obs_dim]), s_t_0_.reshape([1, h_size]), a.reshape([1, act_dim]), r, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            # fps = (time.time() - start)/200
            # print("{} fps".format(200 / (time.time() - start)))
            print(ep_len)
            episode += 1
            start = time.time()
            beta_ = ac_kwargs["beta"] * (1 - t / total_steps)
            # beta_ = ac_kwargs["beta"] * (1 / t ** 0.5)
            for j in range(int(ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                # maybe we can store starting state
                feed_dict = {x_ph: batch['obs1'],
                             s_t_0: batch['s_t_0'],  # all zero matrix for zero state in training
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             beta: beta_,
                             }
                for _ in range(ac_kwargs["tm"] - 1):
                    batch = replay_buffer.sample_batch(batch_size)
                    # maybe we can store starting state
                    feed_dict = {x_ph: batch['obs1'],
                                 s_t_0: batch['s_t_0'],  # stored zero state for training
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done'],
                                 beta: beta_,
                                 }
                    _ = sess.run(train_model_op, feed_dict)
                outs = sess.run(step_ops, feed_dict)
                # print(outs)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3].flatten(),
                             Q2Vals=outs[4].flatten(),
                             LogPi=outs[5].flatten(),
                             Alpha=outs[6],
                             beta=beta_,
                             model_loss=outs[7].flatten())

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            s_t_0_ = np.zeros([1, h_size])  # reset s_t_0_ when one episode is finished
            print("one episode duration:", time.time() - start)
            start = time.time()

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            # if (epoch % save_freq == 0) or (epoch == epochs - 1):
            #     logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(mu, pi, states)

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Episode', episode)
            logger.log_tabular('name', name)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('beta', average_only=True)
            logger.log_tabular('model_loss', with_min_and_max=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
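
The q_backup in the example above is the usual soft Bellman target plus a clipped model-error bonus, beta * in_r. A simplified per-transition NumPy sketch that drops the sequence dimension; the function name and all numbers are illustrative assumptions.

import numpy as np

def soft_q_target(r, d, q1_targ, q2_targ, logp_pi, model_err,
                  alpha=0.2, beta=0.1, gamma=0.99, err_clip=64.0):
    """r + beta * clip(model_err) + gamma * (1 - d) * (min(q1, q2) - alpha * logp_pi)."""
    in_r = np.clip(model_err, 0.0, err_clip)        # clipped curiosity bonus from the model loss
    v_backup = np.minimum(q1_targ, q2_targ) - alpha * logp_pi
    return r + beta * in_r + gamma * (1.0 - d) * v_backup

print(soft_q_target(r=1.0, d=0.0, q1_targ=5.0, q2_targ=4.5,
                    logp_pi=-1.0, model_err=0.3))
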
コード例 #23
0
def sac(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
        update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000,
        logger_kwargs=dict(), save_freq=1, use_gpu=False, learnable_temperature=False):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # env, test_env = env_fn(), env_fn()
    env = env_fn()
    action_space = env.action_space
    action_space.seed(seed)
    act_dim = action_space.shape[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)
    target_entropy = -act_dim

    print(ac)
    device = None
    if use_gpu:
        device = 'cuda'
        ac.cuda()
        ac_targ.cuda()

    log_alpha = torch.tensor(np.log(alpha), requires_grad=True, device=next(ac.parameters()).device)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_space=env.observation_space, act_dim=act_dim, size=replay_size, device=device)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(helper.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done']

        q1 = ac.q1(o,a)
        q2 = ac.q2(o,a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - log_alpha.exp().detach() * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().cpu().numpy(),
                      Q2Vals=q2.detach().cpu().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (log_alpha.exp().detach() * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().cpu().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    log_alpha_optimizer = Adam([log_alpha], lr=lr)
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr, weight_decay=1e-6)
    q_optimizer = Adam(q_params, lr=lr, weight_decay=1e-6)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort 
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        if learnable_temperature:
            log_alpha_optimizer.zero_grad()
            log_prob = pi_info["LogPi"]
            alpha_loss = log_alpha.exp() * (-log_prob - target_entropy).mean()
            alpha_loss.backward()
            log_alpha_optimizer.step()
            logger.store(Alpha=log_alpha.exp().item(), AlphaLoss=alpha_loss.item())

        # Unfreeze Q-networks so you can optimize them at the next SAC step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch_ext.as_tensor(o, dtype=torch.float32, device=device), 
                      deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards, 
        # use the learned policy. 
        if t > start_steps:
            a = get_action(o)
        else:
            a = action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        # env.render(mode='human')
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # if t % 100 == 0: print ("t: ", t)

        # End of epoch handling
        if (t+1) % steps_per_epoch == 0 and t >= update_after:
            epoch = (t+1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, itr=epoch)

            # Test the performance of the deterministic version of the agent.
            # test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            # logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', with_min_and_max=True)
            # logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            if learnable_temperature:
                logger.log_tabular('Alpha', average_only=True)
                logger.log_tabular('AlphaLoss', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
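
The last block of update() above implements the polyak averaging rule described in the docstring, theta_targ <- rho * theta_targ + (1 - rho) * theta. A standalone PyTorch sketch of just that rule; the small linear modules are placeholders, not the actor-critic used above:

# Standalone sketch of a polyak (exponential moving average) target update.
# The linear modules below are placeholders; only the update rule matters.
import torch
import torch.nn as nn

def polyak_update(net, net_targ, polyak=0.995):
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), net_targ.parameters()):
            # in-place mul_/add_ avoid allocating new tensors each step
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)

net, net_targ = nn.Linear(4, 2), nn.Linear(4, 2)
net_targ.load_state_dict(net.state_dict())   # start from identical weights
polyak_update(net, net_targ)                 # net_targ drifts slowly towards net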
Code Example #24
0
#!/usr/bin/env python3
import gym
import numpy as np
from spinup.utils.logx import EpochLogger

env = gym.make('gym_lgsvl:lgsvl-v0')

observation = env.reset()
epoch_logger = EpochLogger()

for i_episode in range(20):
    observation = env.reset()
    while True:
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        epoch_logger.store(Reward=reward)
        if done:
            epoch_logger.log_tabular('Reward', with_min_and_max=True)
            epoch_logger.dump_tabular()
            break
Code Example #25
0
def trpo(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=0.99,
         delta=0.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=0.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=0.8,
         lam=0.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo='trpo'):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given 
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log 
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as 
                                            | specified by the inputs to 
                                            | ``info_phs``) over the batch of 
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure 
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update. 
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be 
            smallish. Adjusts Hessian-vector product calculation:
            
            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient. 
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform. 
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down. 

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the 
            backtracking line search. Since the line search usually doesn't 
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are 
            almost the same.

    """

    # initialize logger and save it
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # initialize seed, and set tf and np
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # get the env function, observation dimensions, and action dimensions
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph
               ] + core.values_as_sorted_list(info_phs)

    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info)

    # Experience buffer
    # calculate the number of steps per epoch per process
    local_steps_per_epoch = int(steps_per_epoch / num_procs())

    # get the info shapes
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}

    # initialize the buffer
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO losses
    # ratio of pi / pi_old
    # pi loss
    # v loss
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    # pi params
    # gradient
    # v_ph and hvp
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph, hvp = core.hessian_vector_product(d_kl, pi_params)

    # check if the damping coeff is needed
    # if so, update hvp (damping_coeff * v_ph)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    # get pi params
    # set pi params
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    # create a tf session and initialize it's variables
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """

        # initialize x as 0s of shape b
        x = np.zeros_like(b)

        # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        # make a copy of b and r as r and p
        r = b.copy()
        p = r.copy()

        # calculate r dot old (r dot r)
        r_dot_old = np.dot(r, r)

        # for cg_iterations
        for _ in range(cg_iters):

            # calc z as Ax(p)
            z = Ax(p)

            # calculate alpha
            alpha = r_dot_old / (np.dot(p, z) + EPS)

            # increment x
            x += alpha * p

            # decrement r
            r -= alpha * z

            # calculate r dot new (r dot r)
            r_dot_new = np.dot(r, r)

            # calculate p
            p = r + (r_dot_new / r_dot_old) * p

            # update r dot old with r dot new
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        # get inputs as a dictionary, all phs and buffer
        inputs = {k: v for k, v in zip(all_phs, buf.get())}

        # calculate Hx
        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))

        # get g, pi_l_old, v_l_old
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)

        # get g and pi_l_old averages
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        # get x
        x = cg(Hx, g)

        # get alpha
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))

        # get old parameters
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            # set pi params with v_ph
            # old_params - alpha * x * step
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})

            # return average of d_kl and pi_loss operation
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo == 'trpo':
            # trpo augments npg with a backtracking line search to enforce the hard KL constraint
            # for backtrack iterations
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        # for train_v_iterations
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)

        # update v_l_new with v_loss operation
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    # Update start time
    start_time = time.time()

    # reset variables
    # o, r, d, ep_ret, ep_len
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            # get agent outputs
            agent_outs = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})

            # unpack the outputs into a, v_t, logp_t, info_t
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[
                1], agent_outs[2], agent_outs[3:]

            # save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            # take an action
            o, r, d, _ = env.step(a)

            # update ep rewards and length
            ep_ret += r
            ep_len += 1

            # check if the episode is done
            terminal = d or (ep_len == max_ep_len)

            # check if terminal or at max t for local epoch
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})

                # add the finish path to buffer
                buf.finish_path(last_val)

                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)

                # reset environment variables
                # o, r, d, ep_ret, ep_len
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
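
The cg() helper in the listing above is a plain conjugate-gradient solver; combined with the damping term it approximately solves (alpha*I + H)x = g using only Hessian-vector products. A small numpy check of the same routine against an explicit symmetric positive-definite matrix (EPS, the matrix, and the damping value here are illustrative only):

# Sketch: the conjugate-gradient routine from the listing, checked on a small
# explicit SPD system (damping*I + H) x = g. All values are illustrative.
import numpy as np

EPS = 1e-8

def cg(Ax, b, iters=10):
    x = np.zeros_like(b)
    r, p = b.copy(), b.copy()
    r_dot_old = np.dot(r, r)
    for _ in range(iters):
        z = Ax(p)
        alpha = r_dot_old / (np.dot(p, z) + EPS)
        x += alpha * p
        r -= alpha * z
        r_dot_new = np.dot(r, r)
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
    return x

rng = np.random.default_rng(0)
A = rng.normal(size=(5, 5))
H = A @ A.T + 5 * np.eye(5)                     # stand-in for the KL Hessian
g = rng.normal(size=5)
damping = 0.1
Ax = lambda v: (damping * np.eye(5) + H) @ v    # damped Hessian-vector product
x = cg(Ax, g)
print(np.allclose(x, np.linalg.solve(damping * np.eye(5) + H, g)))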
Code Example #26
0
File: main_prog_bail.py Project: watchernyu/BAIL
def bail_learn(algo = 'bail_2_bah',
			   env_set="Hopper-v2", seed=0, buffer_type='FinalSigma0.5_env_0_1000K',
			   gamma=0.99, ue_rollout=1000, augment_mc='gain', C=None,
			   eval_freq=625, max_timesteps=int(25e4), batch_size=1000,
			   lr=1e-3, wd=0, ue_lr=3e-3, ue_wd=2e-2, ue_loss_k=1000, ue_vali_freq=1250,
			   pct_anneal_type='constant', last_pct=0.25,
			   select_type='border',
			   logger_kwargs=dict()):

	"""set up logger"""
	global logger
	logger = EpochLogger(**logger_kwargs)
	logger.save_config(locals())

	if not os.path.exists("./plots"):
		os.makedirs("./plots")
	if not os.path.exists("./pytorch_models"):
		os.makedirs("./pytorch_models")

	file_name = "%s_%s_%s" % (algo, env_set, seed)
	setting_name = "%s_r%s_g%s" % (buffer_type.replace('env', env_set), ue_rollout, gamma)
	setting_name += '_noaug' if not (augment_mc) else ''
	setting_name += '_augNew' if augment_mc == 'new' else ''

	print("---------------------------------------")
	print("Algo: " + file_name + "\tData: " + buffer_type)
	print("Settings: " + setting_name)
	print("Evaluate Policy every", eval_freq * batch_size * 0.8 / 1e6,
		  'epoches; Total', max_timesteps * batch_size * 0.8 / 1e6, 'epoches')
	print("---------------------------------------")

	env = gym.make(env_set)
	test_env = gym.make(env_set)

	# Set seeds
	env.seed(seed)
	test_env.seed(seed)
	env.action_space.np_random.seed(seed)
	test_env.action_space.np_random.seed(seed)
	torch.manual_seed(seed)
	np.random.seed(seed)

	state_dim = env.observation_space.shape[0]
	action_dim = env.action_space.shape[0]
	max_action = float(env.action_space.high[0])

	# Load buffer
	replay_buffer = utils.ReplayBuffer()
	buffer_name = buffer_type.replace('env', env_set)
	replay_buffer.load(buffer_name)

	# Load data for training UE
	states = np.load('./results/ueMC_%s_S.npy' % buffer_name, allow_pickle=True).squeeze()

	setting_name += '_Gain' if augment_mc == 'gain' else '_Gt'
	gts = np.load('./results/ueMC_%s.npy' % setting_name, allow_pickle=True).squeeze()
	print('Load mc returns type', augment_mc, 'with gamma:', gamma, 'rollout length:', ue_rollout)

	# Start training
	print('-- Policy train starts --')
	# Initialize policy
	if algo == 'bail_2_bah':
		policy = bail_training.BAIL_selebah(state_dim, action_dim, max_action, max_iters=max_timesteps, States=states, MCrets=gts,
								ue_lr=ue_lr, ue_wd=ue_wd,
								pct_anneal_type=pct_anneal_type, last_pct=last_pct, pct_info_dic=pct_info_dic,
								select_type=select_type, C=C)
	elif algo == 'bail_1_buf':
		policy = bail_training.BAIL_selebuf(state_dim, action_dim, max_action, max_iters=max_timesteps,
										States=states, MCrets=gts,
										ue_lr=ue_lr, ue_wd=ue_wd,
										pct_anneal_type=pct_anneal_type, last_pct=last_pct, pct_info_dic=pct_info_dic,
										select_type=select_type, C=C)
	else:
		raise Exception("! undefined BAIL implementation '%s'" % algo)

	training_iters, epoch = 0, 0
	
	while training_iters < max_timesteps:
		epoch += eval_freq * batch_size * 0.8 / 1e6
		ue = policy.train(replay_buffer, training_iters, iterations=eval_freq, batch_size=batch_size,
								ue_loss_k=ue_loss_k,  ue_vali_freq=ue_vali_freq,
								logger=logger)

		if training_iters >= max_timesteps - eval_freq:
			cur_ue_setting = 'Prog_' + setting_name + '_lossk%s_s%s' % (ue_loss_k, seed)
			bail_training.plot_envelope(ue, states, gts, cur_ue_setting, seed, [ue_lr, ue_wd, ue_loss_k, max_timesteps/batch_size, 4])
			torch.save(ue.state_dict(), '%s/Prog_UE_%s.pth' % ("./pytorch_models", setting_name + \
																  '_s%s_lok%s' % (seed, ue_loss_k)))

		avgtest_reward = evaluate_policy(policy, test_env)
		training_iters += eval_freq

		# log training info
		logger.log_tabular('Epoch', epoch)
		logger.log_tabular('AverageTestEpRet', avgtest_reward)
		logger.log_tabular('TotalSteps', training_iters)
		logger.log_tabular('CloneLoss', average_only=True)
		logger.log_tabular('UELoss', average_only=True)
		logger.log_tabular('BatchUEtrnSize', average_only=True)
		logger.log_tabular('SVal', with_min_and_max=True)
		logger.log_tabular('SelePct', average_only=True)
		logger.log_tabular('BatchUpSize', with_min_and_max=True)
		logger.log_tabular('UEValiLossMin', average_only=True)
		if select_type == 'border':
			logger.log_tabular('Border', with_min_and_max=True)
		elif select_type == 'margin':
			logger.log_tabular('Margin', with_min_and_max=True)
		else:
			raise Exception('! undefined selection type')

		logger.dump_tabular()
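
The loop above relies on evaluate_policy(policy, test_env), which is not part of this listing. A minimal sketch of what such an evaluation helper commonly looks like; the policy.select_action(state) interface is an assumption for illustration, not something confirmed by this snippet:

# Hedged sketch of an evaluation helper like the evaluate_policy() called above.
# policy.select_action(...) is an assumed interface, not confirmed by the listing.
import numpy as np

def evaluate_policy(policy, env, eval_episodes=10):
    total_reward = 0.0
    for _ in range(eval_episodes):
        obs, done = env.reset(), False
        while not done:
            action = policy.select_action(np.array(obs))  # assumed method name
            obs, reward, done, _ = env.step(action)
            total_reward += reward
    return total_reward / eval_episodes                   # average test return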
Code Example #27
0
File: sqn_rpf.py Project: peteroxic/DRL
def sqn_rpf(env_fn,
            actor_critic=core.mlp_actor_critic,
            ac_kwargs=dict(),
            seed=0,
            steps_per_epoch=5000,
            epochs=100,
            replay_size=int(1e6),
            gamma=0.99,
            polyak=0.995,
            lr=1e-3,
            alpha=0.2,
            batch_size=100,
            start_steps=10000,
            max_ep_len=1000,
            logger_kwargs=dict(),
            save_freq=1,
            ensemble_size=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    # print(max_ep_len,type(max_ep_len))
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    obs_space = env.observation_space
    act_dim = env.action_space.n
    act_space = env.action_space

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(
        obs_space, act_space, obs_space, None, None)
    # x_ph, x2_ph: shape(?,128)
    # a_ph: shape(?,1)
    # r_ph, d_ph: shape(?,)
    all_ph = [x_ph, a_ph, x2_ph, r_ph, d_ph]

    ######
    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.n))
        # target_entropy = (np.prod(env.action_space.n))/4/10
        target_entropy = 0.15

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=0.0)
        alpha = tf.exp(log_alpha)
    ######

    # Main outputs from computation graph
    with tf.variable_scope('random_head'):
        head_index = tf.get_variable(name='random_int',
                                     shape=[],
                                     dtype=tf.int32)

    with tf.variable_scope('main'):
        mu, pi, _, q1, _, q2, _ = actor_critic(x_ph,
                                               a_ph,
                                               alpha,
                                               ensemble_size=ensemble_size,
                                               **ac_kwargs)
        # _, _, logp_pi, _, _ = actor_critic(x2_ph, a_ph, alpha, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, q1_pi_, _, q2_pi_ = actor_critic(
            x2_ph, a_ph, alpha, ensemble_size=ensemble_size, **ac_kwargs)

    # Experience buffer
    if isinstance(act_space, Box):
        a_dim = act_dim
    elif isinstance(act_space, Discrete):
        a_dim = 1
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=a_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)

    ######
    if isinstance(alpha, tf.Tensor):
        alpha_loss = tf.reduce_mean(
            -log_alpha * tf.stop_gradient(logp_pi_ + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr,
                                                 name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                  var_list=[log_alpha])


    ######

    # Min Double-Q:
    min_q_pi = [tf.minimum(q1_pi_[i], q2_pi_[i]) for i in range(ensemble_size)]

    # Targets for Q and V regression
    # v_backup = tf.stop_gradient(q1_pi_ - alpha * logp_pi_)  ############################## alpha=0
    v_backup = [
        tf.stop_gradient(min_q_pi[i] - alpha * logp_pi_[i])
        for i in range(ensemble_size)
    ]
    # q_backup = tf.expand_dims(r_ph, axis=-1) + gamma*(1-tf.expand_dims(d_ph, axis=-1))*v_backup
    # q_backup = r_ph + gamma * (1 - d_ph) * v_backup
    q_backup = [
        r_ph + gamma * (1 - d_ph) * v_backup[i] for i in range(ensemble_size)
    ]

    # Soft actor-critic losses
    # q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    # q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    # value_loss = q1_loss + q2_loss
    q1_loss = [
        0.5 * tf.reduce_mean((q_backup[i] - q1[i])**2, axis=0)
        for i in range(ensemble_size)
    ]
    q2_loss = [
        0.5 * tf.reduce_mean((q_backup[i] - q2[i])**2, axis=0)
        for i in range(ensemble_size)
    ]
    value_loss = [q1_loss[i] + q2_loss[i] for i in range(ensemble_size)]

    # # Policy train op
    # # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    #with tf.control_dependencies([train_pi_op]):
    train_value_op = [
        value_optimizer.minimize(value_loss[i], var_list=value_params)
        for i in range(ensemble_size)
    ]
    # train_value_op = [value_optimizer.minimize(value_loss)]

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies(train_value_op):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])  # zip([1,2,3,4],['a','b']) = [(1,'a'),(2,'b')]

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [
            q1_loss[0], q1[0], logp_pi_[0],
            tf.identity(alpha), train_value_op, target_update
        ]
        # step_ops = [q1_loss[0], q1[0], logp_pi_[0], tf.identity(alpha), target_update]
    else:
        step_ops = [
            q1_loss, q1, logp_pi_, alpha, train_value_op, target_update,
            train_alpha_op
        ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu[0],
                              'pi': pi[0],
                              'q1': q1[0]
                          })

    def get_action(o, active_head=0, deterministic=False):
        act_op = mu[active_head] if deterministic else pi[active_head]
        return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    def test_agent(n=3):  # number of tests
        global sess, mu, pi, q1
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):  # max_ep_len
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, deterministic=True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Select a head to interact with env.
    active_head = np.random.randint(ensemble_size)

    # t0 = time.time()

    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """

        # if t > start_steps and 20*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum
        if t > start_steps:
            a = get_action(o, active_head=active_head)
        else:
            a = env.action_space.sample()

        np.random.random()

        # Step the env
        o2, r, d, _ = env.step(a)
        #print(a,o2)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):

            # t_last = t0
            # t0 = time.time()
            # print('episode_time:', t0-t_last, 'ep_len:', ep_len)
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                # step_ops = [q1_loss, q1, logp_pi_, alpha, train_value_op, target_update, train_alpha_op]

                # for i in range(ensemble_size):
                #     batch = replay_buffer.sample_batch(batch_size)
                #     feed_dict = {x_ph: batch['obs1'],
                #                  x2_ph: batch['obs2'],
                #                  a_ph: batch['acts'],
                #                  r_ph: batch['rews'],
                #                  d_ph: batch['done'],
                #                 }
                #     # step_ops = [q1_loss, q1, logp_pi_, alpha, target_update, train_alpha_op]
                #     q_values = sess.make_callable(train_value_op, [o_tm1])
                #     sess.run(train_value_op[i], feed_dict)
                #     #print(i)

                outs = sess.run(step_ops, feed_dict)
                logger.store(LossQ1=outs[0],
                             Q1Vals=outs[1],
                             LogPi=outs[2],
                             Alpha=outs[3])

            logger.store(EpRet=ep_ret, EpLen=ep_len)

            # t_last = t0
            # t0 = time.time()
            # print('training_time:', t0-t_last, 'num_train/ep_len:', ep_len)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            # Select a head to interact with env.
            active_head = np.random.randint(ensemble_size)
            # print(active_head)

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:  # and ep_len < steps_per_epoch:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            # logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            # logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            # logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
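
The active_head bookkeeping above implements ensemble-based exploration: one of the ensemble_size heads is drawn at the start of each episode and acted on greedily until the episode ends, while all heads train on the shared replay buffer. A framework-free sketch of that per-episode head selection (the random Q-table stands in for the learned critics and is illustrative only):

# Sketch of per-episode head selection for an ensemble of Q-functions.
# The random Q-table stands in for learned critics; values are illustrative.
import numpy as np

rng = np.random.default_rng(0)
ensemble_size, n_states, n_actions = 10, 4, 3
q_ensemble = rng.normal(size=(ensemble_size, n_states, n_actions))  # toy critics

for episode in range(3):
    active_head = rng.integers(ensemble_size)    # one head drawn per episode
    state = 0
    for step in range(5):
        # act greedily with respect to the active head only
        action = int(np.argmax(q_ensemble[active_head, state]))
        state = (state + action + 1) % n_states  # toy deterministic transition
    # in the listing, every head is trained on the shared replay buffer;
    # only the behaviour policy changes from episode to episode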
Code Example #28
0
def sac(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=500,
        epochs=100000,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-4,
        alpha=0.2,
        batch_size=100,
        start_epochs=1000,
        max_ep_len=500,
        policy_path=None,
        logger_kwargs=dict(),
        save_freq=100,
        update_steps=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. 
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_epochs (int): Number of epochs of uniform-random action selection,
            before running the real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    test_logger_kwargs = dict()
    test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'],
                                                "test")
    test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    test_logger = EpochLogger(**test_logger_kwargs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['output_activation'] = None

    if policy_path is None:
        # Inputs to computation graph
        x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
            obs_dim, act_dim, obs_dim, None, None)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
                x_ph, a_ph, **ac_kwargs)

        # Target value network
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph,
                                                       **ac_kwargs)

        # Create the session here so that `sess` is defined in this branch as
        # well (the load_policy branch below returns its own session); without
        # this, the later sess.run(...) calls would raise a NameError when no
        # pretrained policy is loaded.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

    else:
        # todo
        # load pretrained model
        with tf.variable_scope('main'):
            sess, x_ph, a_ph, mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = load_policy(
                policy_path,
                itr='last',
                deterministic=False,
                act_high=env.action_space.high)

        x2_ph, r_ph, d_ph = core.placeholders(None, None, None)

        # Target value network
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph,
                                                       **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)

    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    sess.run(tf.variables_initializer(pi_optimizer.variables()))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)
        sess.run(tf.variables_initializer(value_optimizer.variables()))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def get_action(o, deterministic=False):
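        # mu is the policy's deterministic (mean) action; pi draws a stochastic sample.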
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    def test_agent(n=91, test_num=1):
        # sess and the policy tensors are captured from the enclosing scope
        env.unwrapped._set_test_mode(True)
        for i in range(n):
            observation = env.reset()
            policy_cumulated_reward = 0
            for t in range(episode_steps):
                newObservation, reward, done, info = env.step(
                    get_action(np.array(observation), True))
                observation = newObservation
                if (t == episode_steps - 1):
                    print("reached the end")
                    done = True
                policy_cumulated_reward += reward

                if done:
                    test_logger.store(policy_reward=policy_cumulated_reward)
                    test_logger.store(policy_steps=t)
                    test_logger.store(arrive_des=info['arrive_des'])
                    break
        test_logger.log_tabular('epoch', epoch)
        test_logger.log_tabular('policy_reward', average_only=True)
        test_logger.log_tabular('policy_steps', average_only=True)
        test_logger.log_tabular('arrive_des', average_only=True)
        test_logger.dump_tabular()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    test_num = 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        for t in range(steps_per_epoch):
            if epoch > start_epochs:
                a = get_action(np.array(o))
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            if (t == steps_per_epoch - 1):
                print("reached the end")
                d = True

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len==max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if d:
                """
                Perform all SAC updates at the end of the trajectory.
                This is a slight difference from the SAC specified in the
                original paper.
                """
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done'],
                    }
                    outs = sess.run(step_ops, feed_dict)
                    logger.store(LossPi=outs[0],
                                 LossQ1=outs[1],
                                 LossQ2=outs[2],
                                 LossV=outs[3],
                                 Q1Vals=outs[4],
                                 Q2Vals=outs[5],
                                 VVals=outs[6],
                                 LogPi=outs[7])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                break

        # End of epoch wrap-up
        if (epoch > 0 and epoch % save_freq == 0) or (epoch == epochs - 1):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
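
For reference, the Q and V backups assembled in the graph above reduce to plain array arithmetic. The NumPy sketch below mirrors the same computations on a toy batch outside of TensorFlow; all values and array names are illustrative and not part of the original listing.

import numpy as np

# Illustrative toy batch (three transitions).
q1_pi = np.array([1.2, 0.8, 2.0])       # Q1(s, pi(s))
q2_pi = np.array([1.0, 0.9, 1.5])       # Q2(s, pi(s))
logp_pi = np.array([-0.5, -1.0, -0.2])  # log pi(a|s) for the sampled actions
v_targ = np.array([1.1, 0.7, 1.8])      # target network V(s')
r = np.array([0.0, 1.0, -1.0])          # rewards
d = np.array([0.0, 0.0, 1.0])           # done flags
gamma, alpha = 0.99, 0.2

# Same targets as q_backup / v_backup in the graph, without stop_gradient.
q_backup = r + gamma * (1.0 - d) * v_targ
v_backup = np.minimum(q1_pi, q2_pi) - alpha * logp_pi

# The corresponding losses would be 0.5 * the mean squared error between these
# targets and the respective q1, q2 and v estimates.
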
Code Example #29
0
class AntSoftActorCritic:
    def var_scope(self, var):
        return self.main_scope + '/' + var

    def __init__(self,
                 env_fn,
                 reward_fn=[],
                 actor_critic=core.mlp_actor_critic,
                 xid=0,
                 seed=0,
                 max_ep_len=1000,
                 gamma=.99,
                 alpha=0.2,
                 lr=1e-3,
                 polyak=0.995,
                 replay_size=int(1e6),
                 ac_kwargs=dict(),
                 logger_kwargs=dict(),
                 normalization_factors=[],
                 learn_reduced=False):

        tf.set_random_seed(seed)
        np.random.seed(seed)

        self.xid = xid
        self.main_scope = 'main' + str(xid)
        self.target_scope = 'target' + str(xid)

        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(logger_kwargs)

        self.max_ep_len = max_ep_len
        self.reward_fn = reward_fn
        self.normalization_factors = normalization_factors
        self.learn_reduced = learn_reduced

        self.env, self.test_env = env_fn(), env_fn()
        self.obs_dim = len(self.env.env.state_vector())
        if self.learn_reduced:
            self.obs_dim = ant_utils.expected_state_dim
        self.act_dim = self.env.action_space.shape[0]

        # Action limit for clamping: critically, assumes all dimensions share the same bound!
        self.act_limit = self.env.action_space.high[0]

        # Share information about action space with policy architecture
        ac_kwargs['action_space'] = self.env.action_space

        self.graph = tf.Graph()
        with self.graph.as_default():
            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(
                self.obs_dim, self.act_dim, self.obs_dim, None, None)

            # Main outputs from computation graph
            with tf.variable_scope(self.main_scope):
                self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v, self.std = actor_critic(
                    self.x_ph, self.a_ph, **ac_kwargs)

            # Target value network
            with tf.variable_scope(self.target_scope):
                _, _, _, _, _, _, _, self.v_targ, _ = actor_critic(
                    self.x2_ph, self.a_ph, **ac_kwargs)

            # Experience buffer
            self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim,
                                              act_dim=self.act_dim,
                                              size=replay_size)

            # Min Double-Q:
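            # Taking the minimum of the two Q estimates counteracts the
            # overestimation bias that a single critic tends to accumulate.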
            min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)

            # Targets for Q and V regression
            q_backup = tf.stop_gradient(self.r_ph + gamma *
                                        (1 - self.d_ph) * self.v_targ)
            v_backup = tf.stop_gradient(min_q_pi - alpha * self.logp_pi)

            # Soft actor-critic losses
            pi_loss = tf.reduce_mean(alpha * self.logp_pi - self.q1_pi)
            q1_loss = 0.5 * tf.reduce_mean((q_backup - self.q1)**2)
            q2_loss = 0.5 * tf.reduce_mean((q_backup - self.q2)**2)
            v_loss = 0.5 * tf.reduce_mean((v_backup - self.v)**2)
            value_loss = q1_loss + q2_loss + v_loss

            # Policy train op
            # (has to be separate from value train op, because q1_pi appears in pi_loss)
            pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            train_pi_op = pi_optimizer.minimize(pi_loss,
                                                var_list=get_vars(
                                                    self.var_scope('pi')))

            # Value train op
            # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
            value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            value_params = get_vars(self.var_scope('q')) + get_vars(
                self.var_scope('v'))
            with tf.control_dependencies([train_pi_op]):
                train_value_op = value_optimizer.minimize(
                    value_loss, var_list=value_params)

            # Polyak averaging for target variables
            # (control flow because sess.run otherwise evaluates in nondeterministic order)
            with tf.control_dependencies([train_value_op]):
                target_update = tf.group([
                    tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                    for v_main, v_targ in zip(get_vars(self.main_scope),
                                              get_vars(self.target_scope))
                ])

            # All ops to call during one training step
            self.step_ops = [
                pi_loss, q1_loss, q2_loss, v_loss, self.q1, self.q2, self.v,
                self.logp_pi, train_pi_op, train_value_op, target_update
            ]

            # Initializing targets to match main variables
            target_init = tf.group([
                tf.assign(v_targ, v_main)
                for v_main, v_targ in zip(get_vars(self.main_scope),
                                          get_vars(self.target_scope))
            ])

            self.sess = tf.Session(config=tf.ConfigProto(
                log_device_placement=False))
            self.sess.run(tf.global_variables_initializer())
            self.sess.run(target_init)

    def reward(self, env, r, o):
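        # When provided, reward_fn is a lookup table indexed by the discretized
        # state; otherwise the environment reward r is passed through unchanged.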
        if len(self.reward_fn) == 0:
            return r

        # use self.normalization_factors to normalize the state.
        tup = tuple(
            ant_utils.discretize_state(o, self.normalization_factors, env))
        return self.reward_fn[tup]

    def get_action(self, o, deterministic=False):
        if self.learn_reduced:
            o = ant_utils.convert_obs(o)
        with self.graph.as_default():
            act_op = self.mu if deterministic else self.pi
            action = self.sess.run(act_op,
                                   feed_dict={self.x_ph: o.reshape(1, -1)})[0]
            return action

    def get_sigma(self, o):
        if self.learn_reduced:
            o = ant_utils.convert_obs(o)
        with self.graph.as_default():
            return self.sess.run(self.std,
                                 feed_dict={self.x_ph: o.reshape(1, -1)})[0]

    def test_agent(self,
                   T,
                   n=10,
                   initial_state=[],
                   normalization_factors=[],
                   store_log=True,
                   deterministic=True,
                   reset=False):

        denom = 0

        p = np.zeros(shape=(tuple(ant_utils.num_states)))
        p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
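        # p and p_xy are visitation counts over the discretized full state and the
        # discretized (x, y) position; both are normalized into distributions below.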

        for j in range(n):
            o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0

            if len(initial_state) > 0:
                qpos = initial_state[:len(ant_utils.qpos)]
                qvel = initial_state[len(ant_utils.qpos):]
                self.test_env.env.set_state(qpos, qvel)
                o = self.test_env.env._get_obs()

            o = get_state(self.test_env, o)
            while not (d or (ep_len == T)):
                # Take deterministic actions at test time
                a = self.get_action(o, deterministic)
                o, r, d, _ = self.test_env.step(a)
                o = get_state(self.test_env, o)

                r = self.reward(self.test_env, r, o)
                ep_ret += r
                ep_len += 1
                denom += 1

                p[tuple(
                    ant_utils.discretize_state(o, normalization_factors,
                                               self.test_env))] += 1
                p_xy[tuple(
                    ant_utils.discretize_state_2d(o, normalization_factors,
                                                  self.test_env))] += 1

                if d and reset:
                    d = False

            if store_log:
                self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

        p /= float(denom)
        p_xy /= float(denom)
        return p, p_xy

    def test_agent_random(self, T, normalization_factors=[], n=10):

        p = np.zeros(shape=(tuple(ant_utils.num_states)))
        p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))

        cumulative_states_visited_baseline = 0
        states_visited_baseline = []
        cumulative_states_visited_xy_baseline = 0
        states_visited_xy_baseline = []

        denom = 0

        for j in range(n):
            o, r, d, ep_ret, ep_len = self.test_env.reset(), 0, False, 0, 0
            o = get_state(self.test_env, o)
            while not (d or (ep_len == T)):
                a = self.test_env.action_space.sample()
                o, r, d, _ = self.test_env.step(a)
                o = get_state(self.test_env, o)
                r = self.reward(self.test_env, r, o)

                # if this is the first time you are seeing this state, increment.
                if p[tuple(
                        ant_utils.discretize_state(o, normalization_factors,
                                                   self.test_env))] == 0:
                    cumulative_states_visited_baseline += 1
                states_visited_baseline.append(
                    cumulative_states_visited_baseline)
                if p_xy[tuple(
                        ant_utils.discretize_state_2d(o, normalization_factors,
                                                      self.test_env))] == 0:
                    cumulative_states_visited_xy_baseline += 1
                states_visited_xy_baseline.append(
                    cumulative_states_visited_xy_baseline)

                p[tuple(
                    ant_utils.discretize_state(o, normalization_factors,
                                               self.test_env))] += 1
                p_xy[tuple(
                    ant_utils.discretize_state_2d(o, normalization_factors,
                                                  self.test_env))] += 1

                denom += 1
                ep_len += 1

                if d:  # CRITICAL: ignore done signal
                    d = False

        p /= float(denom)
        p_xy /= float(denom)

        return p, p_xy, states_visited_baseline, states_visited_xy_baseline

    # record film of policy
    def record(self, T, n, video_dir='', on_policy=False, deterministic=False):
        print("rendering env in record()")

        # TODO: set width and height.

        for i in range(n):
            self.test_env.reset()
            wrapped_env = wrappers.Monitor(self.test_env,
                                           video_dir + '_%d' % (i))
            o = wrapped_env.reset()

            t = 0
            d = False
            while t < T and not d:
                o = wrapped_env.unwrapped.state_vector()
                if on_policy:
                    a = self.get_action(o, deterministic)
                else:
                    a = wrapped_env.unwrapped.action_space.sample()
                o2, r, d, _ = wrapped_env.step(a)
                print(t)

                if np.all(np.isclose(o, wrapped_env.unwrapped.state_vector())):
                    print('close!')
                    break

                wrapped_env.unwrapped.render(mode='rgb_array',
                                             width=1000,
                                             height=1000)

                o = o2
                t = t + 1
            wrapped_env.close()

    def soft_actor_critic(self,
                          initial_state=[],
                          steps_per_epoch=5000,
                          epochs=100,
                          batch_size=100,
                          start_steps=10000,
                          save_freq=1):

        with self.graph.as_default():

            # Count variables
            var_counts = tuple(
                core.count_vars(scope) for scope in [
                    self.var_scope('pi'),
                    self.var_scope('q1'),
                    self.var_scope('q2'),
                    self.var_scope('v'), self.main_scope
                ])
            print(('\nNumber of parameters: \t pi: %d, \t' + \
                   'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

            # Setup model saving
            self.logger.setup_tf_saver(self.sess,
                                       inputs={
                                           'x': self.x_ph,
                                           'a': self.a_ph
                                       },
                                       outputs={
                                           'mu': self.mu,
                                           'pi': self.pi,
                                           'q1': self.q1,
                                           'q2': self.q2,
                                           'v': self.v
                                       })

            start_time = time.time()
            o, r, d, ep_ret, ep_len = self.env.reset(), 0, False, 0, 0
            if len(initial_state) > 0:
                qpos = initial_state[:len(ant_utils.qpos)]
                qvel = initial_state[len(ant_utils.qpos):]
                self.env.env.set_state(qpos, qvel)
                o = self.env.env._get_obs()

            o = get_state(self.env, o)

            total_steps = steps_per_epoch * epochs

            # Main loop: collect experience in env and update/log each epoch
            for t in range(total_steps):
                """
                Until start_steps have elapsed, randomly sample actions
                from a uniform distribution for better exploration. Afterwards, 
                use the learned policy. 
                """
                if t > start_steps:
                    # if t == start_steps + 1:
                    #     print("!!!! using policy !!!!")
                    a = self.get_action(o)
                else:
                    a = self.env.action_space.sample()

                # Step the env
                o2, r, d, _ = self.env.step(a)
                o2 = get_state(self.env, o2)
                r = self.reward(self.env, r, o2)

                ep_ret += r
                ep_len += 1

                # Ignore the "done" signal if it comes from hitting the time
                # horizon (that is, when it's an artificial terminal signal
                # that isn't based on the agent's state)
                d = False if ep_len == self.max_ep_len else d

                # Store experience to replay buffer
                if self.learn_reduced:
                    self.replay_buffer.store(ant_utils.convert_obs(o), a, r,
                                             ant_utils.convert_obs(o2), d)
                else:
                    self.replay_buffer.store(o, a, r, o2, d)

                # Super critical: update most recent observation.
                o = o2

                if d or (ep_len == self.max_ep_len):
                    """
                    Perform all SAC updates at the end of the trajectory.
                    This is a slight difference from the SAC specified in the
                    original paper.
                    """
                    for j in range(ep_len):
                        batch = self.replay_buffer.sample_batch(batch_size)
                        feed_dict = {
                            self.x_ph: batch['obs1'],
                            self.x2_ph: batch['obs2'],
                            self.a_ph: batch['acts'],
                            self.r_ph: batch['rews'],
                            self.d_ph: batch['done'],
                        }
                        outs = self.sess.run(self.step_ops, feed_dict)
                        self.logger.store(LossPi=outs[0],
                                          LossQ1=outs[1],
                                          LossQ2=outs[2],
                                          LossV=outs[3],
                                          Q1Vals=outs[4],
                                          Q2Vals=outs[5],
                                          VVals=outs[6],
                                          LogPi=outs[7])

                    self.logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = self.env.reset(), 0, False, 0, 0
                    if len(initial_state) > 0:
                        qpos = initial_state[:len(ant_utils.qpos)]
                        qvel = initial_state[len(ant_utils.qpos):]
                        self.env.env.set_state(qpos, qvel)
                        o = self.env.env._get_obs()
                    o = get_state(self.env, o)

                # End of epoch wrap-up
                if t > 0 and t % steps_per_epoch == 0:
                    epoch = t // steps_per_epoch
                    # Save model
                    if (epoch % save_freq == 0) or (epoch == epochs - 1):
                        self.logger.save_state({'env': self.env}, None)

                    # Test the performance of the deterministic version of the agent.
                    self.test_agent(self.max_ep_len)

                    # Log info about epoch
                    self.logger.log_tabular('Epoch', epoch)
                    self.logger.log_tabular('EpRet', with_min_and_max=False)
                    self.logger.log_tabular('TestEpRet',
                                            with_min_and_max=False)
                    self.logger.log_tabular('EpLen', average_only=True)
                    self.logger.log_tabular('TestEpLen', average_only=True)
                    self.logger.log_tabular('LossPi', average_only=True)
                    self.logger.log_tabular('LossQ1', average_only=True)
                    self.logger.log_tabular('LossQ2', average_only=True)
                    self.logger.log_tabular('LossV', average_only=True)
                    self.logger.dump_tabular()
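
A minimal driver sketch for the class above. It assumes a MuJoCo Gym environment whose unwrapped state layout matches what ant_utils expects; the environment id, network sizes, and output directory are illustrative assumptions rather than part of the original listing.

import gym

def make_ant():
    # Hypothetical environment factory; any Ant-like MuJoCo env exposing
    # state_vector() and set_state() should fit the class's expectations.
    return gym.make('Ant-v2')

if __name__ == '__main__':
    sac = AntSoftActorCritic(make_ant,
                             xid=0,
                             seed=0,
                             ac_kwargs=dict(hidden_sizes=(300, 300)),
                             logger_kwargs=dict(output_dir='out/ant_sac_0'))
    sac.soft_actor_critic(steps_per_epoch=5000, epochs=10, start_steps=10000)

    # Visitation distributions over the discretized state space (see test_agent).
    p, p_xy = sac.test_agent(T=1000, n=5, deterministic=True)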