Example #1
    def _build_networks(self):
        # Define input placeholders
        self.s = tf.placeholder(tf.float32,
                                shape=[None] + self.state_dim,
                                name='state')
        self.a = tf.placeholder(tf.int32, shape=(None, ), name='action')
        self.s_next = tf.placeholder(tf.float32,
                                     shape=[None] + self.state_dim,
                                     name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None, ), name='reward')
        self.done = tf.placeholder(tf.float32,
                                   shape=(None, ),
                                   name='done_flag')

        # Actor: action probabilities
        self.actor = dense_nn(self.s,
                              self.layer_sizes + [self.act_size],
                              name='actor')
        self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1))
        self.actor_proba = tf.nn.softmax(self.actor)
        self.actor_vars = self.scope_vars('actor')

        # Critic: state value, V(s)
        self.critic = dense_nn(self.s, self.layer_sizes + [1], name='critic')
        self.critic_next = dense_nn(self.s_next,
                                    self.layer_sizes + [1],
                                    name='critic',
                                    reuse=True)
        self.critic_vars = self.scope_vars('critic')

        # TD target
        self.td_target = self.r + self.gamma * tf.squeeze(
            self.critic_next) * (1.0 - self.done)
        self.td_error = self.td_target - tf.squeeze(self.critic)
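The example stops at the TD error; below is a minimal sketch of how it could drive the actor and critic losses, assuming learning-rate placeholders `self.lr_a` and `self.lr_c` that are not part of the original snippet.
        # Hypothetical continuation, not in the original example.
        action_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_ohe')
        logp_a = tf.reduce_sum(tf.nn.log_softmax(self.actor) * action_ohe, axis=-1)
        # Policy gradient weighted by the (stop-gradient) TD error; the critic fits the TD target.
        loss_actor = -tf.reduce_mean(tf.stop_gradient(self.td_error) * logp_a)
        loss_critic = tf.reduce_mean(tf.square(self.td_error))
        train_actor = tf.train.AdamOptimizer(self.lr_a).minimize(
            loss_actor, var_list=self.actor_vars)    # self.lr_a is an assumed placeholder
        train_critic = tf.train.AdamOptimizer(self.lr_c).minimize(
            loss_critic, var_list=self.critic_vars)  # self.lr_c is an assumed placeholder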
Example #2
    def _build_networks(self):
        # Define input placeholders
        self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
        self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.s_next = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None,), name='reward')
        self.done = tf.placeholder(tf.float32, shape=(None,), name='done_flag')

        self.old_logp_a = tf.placeholder(tf.float32, shape=(None,), name='old_logp_actor')
        self.R = tf.placeholder(tf.float32, shape=(None,), name='return')
        self.adv = tf.placeholder(tf.float32, shape=(None,), name='adv')

        # Policy network with a value head
        with tf.variable_scope('ppo_model'):

            # Latent state from which the policy distribution parameters are inferred
            self.latent = dense_nn(self.s, self.actor_layers[:-1], name='prob_dist')
            self.actor = dense_nn(self.latent, self.actor_layers[-1:] + [self.act_size],
                                name='actor')
            self.actor_proba = tf.nn.softmax(self.actor)
            a_ohe = tf.one_hot(self.a, self.act_size, 1.0, 0.0, name='action_ohe')
            self.logp_a = tf.reduce_sum(tf.log(self.actor_proba) * a_ohe,
                                        axis=-1, name='new_logp_actor')

            self.v = tf.squeeze(dense_nn(self.latent, self.actor_layers[-1:] + [1], name='value'))

            self.params = self.scope_vars('ppo_model')
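These tensors are set up for the PPO clipped surrogate objective; the sketch below shows one way to build it, assuming a clipping hyperparameter `clip_range` (the continuation is not part of the original example).
        # Hypothetical continuation, not in the original example.
        clip_range = 0.2  # assumed clipping hyperparameter
        ratio = tf.exp(self.logp_a - self.old_logp_a)
        clipped = tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range)
        loss_pi = -tf.reduce_mean(tf.minimum(ratio * self.adv, clipped * self.adv))
        loss_v = tf.reduce_mean(tf.square(self.v - self.R))  # value head fits the return R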
Example #3
    def _build_networks(self):
        """For continuous action space.
        """
        # Define input placeholders
        self.s = tf.placeholder(tf.float32,
                                shape=[None] + self.state_dim,
                                name='state')
        self.a = tf.placeholder(tf.float32,
                                shape=[None] + self.act_dim,
                                name='action')
        self.s_next = tf.placeholder(tf.float32,
                                     shape=[None] + self.state_dim,
                                     name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None, ), name='reward')

        with tf.variable_scope('primary'):
            # Actor: deterministic policy mu(s) outputs one action vector.
            self.mu = dense_nn(self.s,
                               self.actor_layers + self.act_dim,
                               output_fn=tf.nn.tanh,
                               name='mu')
            # Critic: action value, Q(s, a)
            self.Q = dense_nn(tf.concat([self.s, self.a], axis=1),
                              self.critic_layers + [1],
                              name='Q')
            # We want to train the mu network to maximize the Q value estimated by the critic;
            # this branch is only used for training.
            self.Q_mu = dense_nn(tf.concat([self.s, self.mu], axis=1),
                                 self.critic_layers + [1],
                                 name='Q',
                                 reuse=True)

        with tf.variable_scope('target'):
            # Clone target networks.
            self.mu_target = dense_nn(self.s_next,
                                      self.actor_layers + self.act_dim,
                                      output_fn=tf.nn.tanh,
                                      name='mu')
            self.Q_target = dense_nn(tf.concat([self.s_next, self.mu_target],
                                               axis=1),
                                     self.critic_layers + [1],
                                     name='Q')

        self.Q_vars = self.scope_vars('primary/Q')
        self.mu_vars = self.scope_vars('primary/mu')

        # sanity check
        self.primary_vars = self.Q_vars + self.mu_vars
        self.target_vars = self.scope_vars('target/Q') + self.scope_vars(
            'target/mu')
        assert len(self.primary_vars) == len(self.target_vars)
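A sketch of the DDPG losses and soft target updates this primary/target split is meant for; it assumes `self.gamma`, learning-rate placeholders `self.lr_a`/`self.lr_c`, and a soft-update rate `tau`, none of which appear in the original example.
        # Hypothetical continuation, not in the original example.
        tau = 0.01  # assumed soft-update rate
        y = self.r + self.gamma * tf.squeeze(self.Q_target)  # self.gamma is assumed
        loss_Q = tf.reduce_mean(tf.square(tf.stop_gradient(y) - tf.squeeze(self.Q)))
        loss_mu = -tf.reduce_mean(self.Q_mu)                  # ascend Q(s, mu(s))
        train_Q = tf.train.AdamOptimizer(self.lr_c).minimize(loss_Q, var_list=self.Q_vars)
        train_mu = tf.train.AdamOptimizer(self.lr_a).minimize(loss_mu, var_list=self.mu_vars)
        # Assumes primary_vars and target_vars are ordered consistently (Q first, then mu).
        update_target_ops = tf.group(*[
            v_t.assign(tau * v + (1.0 - tau) * v_t)
            for v, v_t in zip(self.primary_vars, self.target_vars)
        ])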
Example #4
    def build(self):
        self.lr = tf.placeholder(tf.float32, shape=None, name='learning_rate')

        # Inputs
        self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
        self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.returns = tf.placeholder(tf.float32, shape=(None,), name='return')

        # Build network
        self.pi = dense_nn(self.s, self.layer_sizes + [self.act_size], name='pi_network')
        self.sampled_actions = tf.squeeze(tf.multinomial(self.pi, 1))
        self.pi_vars = self.scope_vars('pi_network')

        if self.baseline:
            # State value estimation as the baseline
            self.v = tf.squeeze(dense_nn(self.s, self.layer_sizes + [1], name='v_network'), axis=1)
            self.target = self.returns - self.v  # advantage

            with tf.variable_scope('v_optimize'):
                self.loss_v = tf.reduce_mean(tf.squared_difference(self.v, self.returns))
                self.optim_v = tf.train.AdamOptimizer(self.lr).minimize(self.loss_v, name='adam_optim_v')
        else:
            self.target = tf.identity(self.returns)

        with tf.variable_scope('pi_optimize'):
            self.loss_pi = tf.reduce_mean(
                tf.stop_gradient(self.target) * tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.pi, labels=self.a), name='loss_pi')
            # self.optim_pi = tf.train.AdamOptimizer(self.lr)
            # self.grads_pi = self.optim_pi.compute_gradients(self.loss_pi, self.pi_vars)
            # self.train_pi_op = self.optim_pi.apply_gradients(self.grads_pi)
            self.optim_pi = tf.train.AdamOptimizer(self.lr).minimize(self.loss_pi, name='adam_optim_pi')

        with tf.variable_scope('summary'):
            self.loss_pi_summ = tf.summary.scalar('loss_pi', self.loss_pi)

            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
            self.ep_reward_summ = tf.summary.scalar('episode_reward', self.ep_reward)
            summ_list = [self.loss_pi_summ, self.ep_reward_summ]

            if self.baseline:
                self.loss_v_summ = tf.summary.scalar('loss_v', self.loss_v)
                summ_list.append(self.loss_v_summ)

            self.merged_summary = tf.summary.merge(summ_list)

        if self.baseline:
            self.train_ops = [self.optim_pi, self.optim_v]
        else:
            self.train_ops = [self.optim_pi]

        self.sess.run(tf.global_variables_initializer())
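A small helper for the `returns` placeholder above: discounted returns computed from an episode's rewards. It is a sketch, not part of the original example; `gamma` is an assumed discount factor.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # Work backwards so each step accumulates its discounted future reward.
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns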
Example #5
    def _build_networks(self):
        # Define input placeholders
        self.s = tf.placeholder(tf.float32,
                                shape=[None] + self.state_dim,
                                name='state')
        self.a = tf.placeholder(tf.int32, shape=(None, ), name='action')
        self.s_next = tf.placeholder(tf.float32,
                                     shape=[None] + self.state_dim,
                                     name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None, ), name='reward')
        self.done = tf.placeholder(tf.float32,
                                   shape=(None, ),
                                   name='done_flag')

        self.old_logp_a = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='old_logp_actor')
        self.v_target = tf.placeholder(tf.float32,
                                       shape=(None, ),
                                       name='v_target')
        self.adv = tf.placeholder(tf.float32, shape=(None, ), name='adv')

        with tf.variable_scope('actor'):
            # Actor: action probabilities
            self.actor = dense_nn(self.s,
                                  self.actor_layers + [self.act_size],
                                  name='actor')
            self.actor_proba = tf.nn.softmax(self.actor)
            a_ohe = tf.one_hot(self.a,
                               self.act_size,
                               1.0,
                               0.0,
                               name='action_ohe')
            self.logp_a = tf.reduce_sum(tf.log(self.actor_proba) * a_ohe,
                                        axis=-1,
                                        name='new_logp_actor')
            self.actor_vars = self.scope_vars('actor')

        with tf.variable_scope('critic'):
            # Critic: state value, V(s)
            self.critic = tf.squeeze(
                dense_nn(self.s, self.critic_layers + [1], name='critic'))
            self.critic_next = tf.squeeze(
                dense_nn(self.s_next,
                         self.critic_layers + [1],
                         name='critic',
                         reuse=True))
            self.critic_vars = self.scope_vars('critic')
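The `v_target` and `adv` placeholders are typically filled with GAE-style estimates computed from the critic values at rollout time; below is a minimal numpy sketch, not part of the original example, with assumed `gamma` and `lam` hyperparameters.
import numpy as np

def compute_gae(rewards, values, next_values, dones, gamma=0.99, lam=0.95):
    # Generalized Advantage Estimation over one trajectory (all inputs are 1-D arrays).
    advs = np.zeros(len(rewards), dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_values[t] * (1.0 - dones[t]) - values[t]
        last_adv = delta + gamma * lam * (1.0 - dones[t]) * last_adv
        advs[t] = last_adv
    v_targets = advs + values  # feeds the v_target placeholder
    return advs, v_targets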
Example #6
    def create_q_networks(self):
        # The first dimension should be batch_size * step_size.
        self.states = tf.placeholder(tf.float32,
                                     shape=(None, *self.state_dim),
                                     name='state')
        self.states_next = tf.placeholder(tf.float32,
                                          shape=(None, *self.state_dim),
                                          name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None, ), name='action')
        self.actions_next = tf.placeholder(tf.int32,
                                           shape=(None, ),
                                           name='action_next')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, ),
                                      name='reward')
        self.done_flags = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='done')

        # The output is one Q value per action.

        net_class, net_params = self._extract_network_params()

        if self.dueling:
            self.q_hidden = net_class(self.states,
                                      self.layer_sizes[:-1],
                                      name='Q_primary',
                                      **net_params)
            self.adv = dense_nn(self.q_hidden,
                                self.layer_sizes[-1:] + [self.act_size],
                                name='Q_primary_adv')
            self.v = dense_nn(self.q_hidden,
                              self.layer_sizes[-1:] + [1],
                              name='Q_primary_v')

            # Average Dueling
            self.q = self.v + (self.adv - tf.reduce_mean(
                self.adv, axis=1, keepdims=True))

            self.q_target_hidden = net_class(self.states_next,
                                             self.layer_sizes[:-1],
                                             name='Q_target',
                                             **net_params)
            self.adv_target = dense_nn(self.q_target_hidden,
                                       self.layer_sizes[-1:] + [self.act_size],
                                       name='Q_target_adv')
            self.v_target = dense_nn(self.q_target_hidden,
                                     self.layer_sizes[-1:] + [1],
                                     name='Q_target_v')

            # Average Dueling
            self.q_target = self.v_target + (self.adv_target - tf.reduce_mean(
                self.adv_target, axis=1, keepdims=True))

        else:
            self.q = net_class(self.states,
                               self.layer_sizes + [self.act_size],
                               name='Q_primary',
                               **net_params)
            self.q_target = net_class(self.states_next,
                                      self.layer_sizes + [self.act_size],
                                      name='Q_target',
                                      **net_params)

        # The primary and target Q networks should match.
        self.q_vars = self.scope_vars('Q_primary')
        self.q_target_vars = self.scope_vars('Q_target')
        assert len(self.q_vars) == len(self.q_target_vars), \
            "The primary and target Q-networks do not match."
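A sketch of the Q-learning loss and target-network sync these two networks are built for; it assumes `self.gamma` and a learning-rate placeholder `self.lr`, neither of which is defined in the original example.
        # Hypothetical continuation, not in the original example.
        action_ohe = tf.one_hot(self.actions, self.act_size, 1.0, 0.0, name='action_ohe')
        q_pred = tf.reduce_sum(self.q * action_ohe, axis=1)
        y = self.rewards + self.gamma * tf.reduce_max(self.q_target, axis=1) * (1.0 - self.done_flags)
        loss = tf.reduce_mean(tf.square(tf.stop_gradient(y) - q_pred))
        train_op = tf.train.AdamOptimizer(self.lr).minimize(loss, var_list=self.q_vars)
        # Periodically copy primary weights into the target network.
        update_target_op = tf.group(*[
            v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)
        ])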
Example #7
    def build(self):
        self.learning_rate_c = tf.placeholder(tf.float32,
                                              shape=None,
                                              name='learning_rate_c')
        self.learning_rate_a = tf.placeholder(tf.float32,
                                              shape=None,
                                              name='learning_rate_a')

        # Inputs
        self.states = tf.placeholder(tf.float32,
                                     shape=(None, self.obs_size),
                                     name='state')
        self.actions = tf.placeholder(tf.int32, shape=(None, ), name='action')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, ),
                                      name='reward')
        self.td_targets = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='td_target')

        # Actor: action probabilities
        self.actor = dense_nn(self.states,
                              self.layer_sizes + [self.act_size],
                              name='actor')
        self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1))
        self.actor_proba = tf.nn.softmax(self.actor)
        self.actor_vars = self.scope_vars('actor')

        # Critic: action value (Q-value)
        self.critic = dense_nn(self.states,
                               self.layer_sizes + [1],
                               name='critic')
        self.critic_vars = self.scope_vars('critic')

        action_ohe = tf.one_hot(self.actions,
                                self.act_size,
                                1.0,
                                0.0,
                                name='action_one_hot')
        self.pred_value = tf.reduce_sum(self.critic * action_ohe,
                                        axis=-1,
                                        name='q_acted')
        self.td_errors = self.td_targets - tf.reshape(self.pred_value, [-1])

        with tf.variable_scope('critic_train'):
            # self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars])
            self.loss_c = tf.reduce_mean(tf.square(self.td_errors))  # + 0.001 * self.reg_c
            self.optim_c = tf.train.AdamOptimizer(self.learning_rate_c)
            self.grads_c = self.optim_c.compute_gradients(
                self.loss_c, self.critic_vars)
            if self.grad_clip_norm:
                self.grads_c = [(tf.clip_by_norm(grad,
                                                 self.grad_clip_norm), var)
                                for grad, var in self.grads_c]

            self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

        with tf.variable_scope('actor_train'):
            # self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars])
            # self.entropy_a = -tf.reduce_sum(self.actor_proba * tf.log(self.actor_proba))
            self.loss_a = tf.reduce_mean(
                tf.stop_gradient(self.td_errors) *
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.actor, labels=self.actions),
                name='loss_actor')  # + 0.001 * self.reg_a
            self.optim_a = tf.train.AdamOptimizer(self.learning_rate_a)
            self.grads_a = self.optim_a.compute_gradients(
                self.loss_a, self.actor_vars)
            if self.grad_clip_norm:
                self.grads_a = [(tf.clip_by_norm(grad,
                                                 self.grad_clip_norm), var)
                                for grad, var in self.grads_a]

            self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

        with tf.variable_scope('summary'):
            self.grads_a_summ = [
                tf.summary.scalar('grads/a_' + var.name, tf.norm(grad))
                for grad, var in self.grads_a if grad is not None
            ]
            self.grads_c_summ = [
                tf.summary.scalar('grads/c_' + var.name, tf.norm(grad))
                for grad, var in self.grads_c if grad is not None
            ]
            self.loss_c_summ = tf.summary.scalar('loss/critic', self.loss_c)
            self.loss_a_summ = tf.summary.scalar('loss/actor', self.loss_a)

            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
            self.ep_reward_summ = tf.summary.scalar('episode_reward',
                                                    self.ep_reward)

            self.merged_summary = tf.summary.merge_all(
                key=tf.GraphKeys.SUMMARIES)

        self.train_ops = [self.train_op_a, self.train_op_c]

        self.sess.run(tf.global_variables_initializer())
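A hypothetical usage sketch for this class: running one training step, assuming `agent` is an instance and the batch arrays come from collected transitions. None of these variable names appear in the original example.
_, summ = agent.sess.run(
    [agent.train_ops, agent.merged_summary],
    feed_dict={
        agent.states: batch_states,
        agent.actions: batch_actions,
        agent.td_targets: batch_td_targets,
        agent.learning_rate_a: 1e-4,
        agent.learning_rate_c: 1e-3,
        agent.ep_reward: episode_reward,
    })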