Example #1
import tensorflow as tf  # TensorFlow 1.x API

# `util` is assumed to be the project's helper module providing scope_vars,
# absolute_scope_name and the theano-style `function` wrapper.
import util

def build_train(actor,
                critic,
                obs_dim,
                num_actions,
                gamma=1.0,
                scope='ddpg',
                tau=0.001,
                reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # input placeholders
        obs_t_input = tf.placeholder(tf.float32, [None, obs_dim], name='obs_t')
        act_t_ph = tf.placeholder(tf.float32, [None, num_actions],
                                  name='action')
        rew_t_ph = tf.placeholder(tf.float32, [None], name='reward')
        obs_tp1_input = tf.placeholder(tf.float32, [None, obs_dim],
                                       name='obs_tp1')
        done_mask_ph = tf.placeholder(tf.float32, [None], name='done')

        # actor network
        policy_t = actor(obs_t_input, num_actions, scope='actor')
        actor_func_vars = util.scope_vars(util.absolute_scope_name('actor'),
                                          trainable_only=True)

        # target actor network
        policy_tp1 = actor(obs_tp1_input, num_actions, scope='target_actor')
        target_actor_func_vars = util.scope_vars(
            util.absolute_scope_name('target_actor'), trainable_only=True)

        # critic network
        q_t = critic(obs_t_input, act_t_ph, num_actions, scope='critic')
        q_t_with_actor = critic(obs_t_input,
                                policy_t,
                                num_actions,
                                scope='critic',
                                reuse=True)
        critic_func_vars = util.scope_vars(util.absolute_scope_name('critic'),
                                           trainable_only=True)

        # target critic network
        q_tp1 = critic(obs_tp1_input,
                       policy_tp1,
                       num_actions,
                       scope='target_critic')
        target_critic_func_vars = util.scope_vars(
            util.absolute_scope_name('target_critic'), trainable_only=True)

        # loss
        with tf.variable_scope('target_q'):
            # Bellman target: y = r + gamma * (1 - done) * Q'(s', mu'(s'))
            v = (1 - done_mask_ph) * gamma * tf.stop_gradient(q_tp1)
            target_q = rew_t_ph + v
        critic_loss = tf.reduce_mean(tf.square(target_q - q_t),
                                     name='critic_loss')
        # maximize Q(s, mu(s)) by minimizing its negative
        actor_loss = -tf.reduce_mean(q_t_with_actor, name='actor_loss')

        # optimize operations
        critic_optimizer = tf.train.AdamOptimizer(0.001)
        critic_optimize_expr = critic_optimizer.minimize(
            critic_loss, var_list=critic_func_vars)
        actor_optimizer = tf.train.AdamOptimizer(0.0001)
        actor_optimize_expr = actor_optimizer.minimize(
            actor_loss, var_list=actor_func_vars)

        # update critic target operations
        with tf.variable_scope('update_critic_target'):
            update_critic_target_expr = []
            sorted_vars = sorted(critic_func_vars, key=lambda v: v.name)
            sorted_target_vars = sorted(target_critic_func_vars,
                                        key=lambda v: v.name)
            # soft-update target critic variables toward the critic (Polyak averaging)
            for var, var_target in zip(sorted_vars, sorted_target_vars):
                new_var = tau * var + (1 - tau) * var_target
                update_critic_target_expr.append(var_target.assign(new_var))
            update_critic_target_expr = tf.group(*update_critic_target_expr)

        # update actor target operations
        with tf.variable_scope('update_actor_target'):
            update_actor_target_expr = []
            sorted_vars = sorted(actor_func_vars, key=lambda v: v.name)
            sorted_target_vars = sorted(target_actor_func_vars,
                                        key=lambda v: v.name)
            # soft-update target actor variables toward the actor (Polyak averaging)
            for var, var_target in zip(sorted_vars, sorted_target_vars):
                new_var = tau * var + (1 - tau) * var_target
                update_actor_target_expr.append(var_target.assign(new_var))
            update_actor_target_expr = tf.group(*update_actor_target_expr)

        # action theano-style function
        act = util.function(inputs=[obs_t_input], outputs=policy_t)

        # train theano-style function
        train_actor = util.function(inputs=[obs_t_input],
                                    outputs=[actor_loss],
                                    updates=[actor_optimize_expr])
        train_critic = util.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph
        ],
                                     outputs=[critic_loss],
                                     updates=[critic_optimize_expr])

        # update target theano-style function
        update_actor_target = util.function([], [],
                                            updates=[update_actor_target_expr])
        update_critic_target = util.function(
            [], [], updates=[update_critic_target_expr])

        return act, train_actor, train_critic, update_actor_target, update_critic_target
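
The builder above only constructs the graph and returns theano-style callables. The following is a minimal training-loop sketch, assuming a Gym-style `env`, user-supplied `actor` and `critic` builders, a plain-list replay buffer, and that `util.function` executes against the default TensorFlow session; none of these assumptions come from the example itself.

# Hypothetical usage sketch (illustrative only; `env`, `actor`, `critic`
# and the noise scale are assumptions, not part of the example above).
import random

import numpy as np

act, train_actor, train_critic, update_actor_target, update_critic_target = \
    build_train(actor, critic, obs_dim=3, num_actions=1)

replay_buffer = []
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    obs = env.reset()
    for step in range(100000):
        # deterministic policy output plus Gaussian exploration noise
        action = act([obs])[0] + np.random.normal(0.0, 0.1, size=1)
        next_obs, reward, done, _ = env.step(action)
        replay_buffer.append((obs, action, reward, next_obs, float(done)))
        if len(replay_buffer) >= 64:
            batch = random.sample(replay_buffer, 64)
            obs_t, act_t, rew_t, obs_tp1, dones = map(np.array, zip(*batch))
            train_critic(obs_t, act_t, rew_t, obs_tp1, dones)
            train_actor(obs_t)
            # Polyak-average the online weights into the target networks
            update_actor_target()
            update_critic_target()
        obs = env.reset() if done else next_obs
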
Example #2
        step_size_ph = tf.placeholder(tf.int32, [], name='step_size')
        mask_ph = tf.placeholder(tf.float32, [None], name='mask')
        if continuous:
            act_t_ph = tf.placeholder(
                tf.float32, [None, num_actions], name='action')
        else:
            act_t_ph = tf.placeholder(tf.int32, [None], name='action')

        # rnn state in tuple
        rnn_state_tuple = tf.contrib.rnn.LSTMStateTuple(
            rnn_state_ph0, rnn_state_ph1)

        policy, value, dist = network(
            obs_t_input, rnn_state_tuple, num_actions, lstm_unit,
            nenvs, step_size_ph, continuous, scope='network', reuse=reuse)
        network_func_vars = util.scope_vars(
            util.absolute_scope_name('network'), trainable_only=True)

        old_policy, old_value, old_dist = network(
            obs_t_input, rnn_state_tuple, num_actions, lstm_unit,
            nenvs, step_size_ph, continuous, scope='old_network', reuse=reuse)
        old_network_func_vars = util.scope_vars(
            util.absolute_scope_name('old_network'),
            trainable_only=True)

        tmp_policy, tmp_value, tmp_dist = network(
            obs_t_input, rnn_state_tuple, num_actions, lstm_unit,
            nenvs, step_size_ph, continuous, scope='tmp_network', reuse=reuse)
        tmp_network_func_vars = util.scope_vars(
            util.absolute_scope_name('tmp_network'),
            trainable_only=True)

        # reshape inputs
        advantages = tf.reshape(advantage_t_ph, [-1, 1])
Example #3
import tensorflow as tf  # TensorFlow 1.x API

# `util` is assumed to be the project's helper module providing scope_vars,
# absolute_scope_name and the theano-style `function` wrapper.
import util

def build_train(network,
                obs_dim,
                num_actions,
                gamma=1.0,
                epsilon=0.2,
                beta=0.01,
                scope='ppo',
                reuse=None):
    with tf.device('/gpu:0'):
        with tf.variable_scope(scope, reuse=reuse):
            # input placeholders
            obs_t_input = tf.placeholder(tf.float32, [None, obs_dim],
                                         name='obs_t')
            act_t_ph = tf.placeholder(tf.float32, [None, num_actions],
                                      name='action')
            return_t_ph = tf.placeholder(tf.float32, [None, 1], name='return')
            advantage_t_ph = tf.placeholder(tf.float32, [None, 1],
                                            name='advantage')

            policy, value, dist = network(obs_t_input,
                                          num_actions,
                                          scope='network',
                                          reuse=reuse)
            network_func_vars = util.scope_vars(
                util.absolute_scope_name('network'), trainable_only=True)

            old_policy, old_value, old_dist = network(obs_t_input,
                                                      num_actions,
                                                      scope='old_network',
                                                      reuse=reuse)
            old_network_func_vars = util.scope_vars(
                util.absolute_scope_name('old_network'), trainable_only=True)

            tmp_policy, tmp_value, tmp_dist = network(obs_t_input,
                                                      num_actions,
                                                      scope='tmp_network',
                                                      reuse=reuse)
            tmp_network_func_vars = util.scope_vars(
                util.absolute_scope_name('tmp_network'), trainable_only=True)

            # clipped surrogate objective:
            #   L_clip = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]
            # with probability ratio r_t = pi(a|s) / pi_old(a|s); the 1e-5
            # offset is presumably for numerical stability
            cur_policy = dist.log_prob(act_t_ph + 1e-5)
            old_policy = old_dist.log_prob(act_t_ph + 1e-5)
            ratio = tf.exp(cur_policy - old_policy)
            clipped_ratio = tf.clip_by_value(ratio, 1.0 - epsilon,
                                             1.0 + epsilon)
            # negated because the optimizer minimizes
            surrogate = -tf.reduce_mean(
                tf.minimum(ratio, clipped_ratio) * advantage_t_ph,
                name='surrogate')

            with tf.variable_scope('loss'):
                # value network loss
                value_loss = tf.reduce_mean(tf.square(value - return_t_ph))

                # entropy penalty for exploration
                entropy = tf.reduce_mean(dist.entropy())
                penalty = -beta * entropy

                # total loss
                loss = surrogate + value_loss + penalty

            # optimize operations
            optimizer = tf.train.AdamOptimizer(3 * 1e-4)
            optimize_expr = optimizer.minimize(loss,
                                               var_list=network_func_vars)

            # update old network operations
            with tf.variable_scope('update_old_network'):
                update_old_expr = []
                sorted_tmp_vars = sorted(tmp_network_func_vars,
                                         key=lambda v: v.name)
                sorted_old_vars = sorted(old_network_func_vars,
                                         key=lambda v: v.name)
                for var_tmp, var_old in zip(sorted_tmp_vars, sorted_old_vars):
                    update_old_expr.append(var_old.assign(var_tmp))
                update_old_expr = tf.group(*update_old_expr)

            # update tmp network operations
            with tf.variable_scope('update_tmp_network'):
                update_tmp_expr = []
                sorted_vars = sorted(network_func_vars, key=lambda v: v.name)
                sorted_tmp_vars = sorted(tmp_network_func_vars,
                                         key=lambda v: v.name)
                for var, var_tmp in zip(sorted_vars, sorted_tmp_vars):
                    update_tmp_expr.append(var_tmp.assign(var))
                update_tmp_expr = tf.group(*update_tmp_expr)

            # action theano-style function
            act = util.function(inputs=[obs_t_input], outputs=[policy, value])

            # train theano-style function
            train = util.function(
                inputs=[obs_t_input, act_t_ph, return_t_ph, advantage_t_ph],
                outputs=[loss, value_loss,
                         tf.reduce_mean(ratio)],
                updates=[optimize_expr])

            # old/tmp network synchronization theano-style functions
            update_old = util.function([], [], updates=[update_old_expr])
            backup_current = util.function([], [], updates=[update_tmp_expr])

            return act, train, update_old, backup_current
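
A similarly hedged driver sketch for the PPO builder follows. `collect_rollout` is a hypothetical helper that gathers a batch of observations, actions, discounted returns and advantages, and the backup/update ordering and epoch count are plausible choices rather than the source's actual loop.

# Hypothetical PPO driver (illustrative only). `network` and collect_rollout
# are assumptions; collect_rollout is assumed to return arrays shaped
# [batch, obs_dim], [batch, num_actions], [batch, 1] and [batch, 1].
act, train, update_old, backup_current = build_train(network,
                                                     obs_dim=4,
                                                     num_actions=2)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for iteration in range(1000):
        # snapshot the current parameters into the tmp network, then expose
        # that snapshot as the "old" policy used in the ratio computation
        backup_current()
        update_old()
        obs_t, act_t, returns, advantages = collect_rollout(act)
        # several gradient steps on the same rollout
        for epoch in range(4):
            loss, value_loss, mean_ratio = train(obs_t, act_t,
                                                 returns, advantages)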