def _build_networks(self):
        # Define input placeholders
        self.s = tf.placeholder(tf.float32,
                                shape=[None] + self.state_dim,
                                name='state')
        self.a = tf.placeholder(tf.int32, shape=(None, ), name='action')
        self.s_next = tf.placeholder(tf.float32,
                                     shape=[None] + self.state_dim,
                                     name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None, ), name='reward')
        self.done = tf.placeholder(tf.float32,
                                   shape=(None, ),
                                   name='done_flag')

        # Actor: outputs action logits; the softmax below gives the action probabilities.
        self.actor = dense_nn(self.s,
                              self.layer_sizes + [self.act_size],
                              name='actor')
        self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1))
        self.actor_proba = tf.nn.softmax(self.actor)
        self.actor_vars = self.scope_vars('actor')

        # Critic: state value, V(s)
        self.critic = dense_nn(self.s, self.layer_sizes + [1], name='critic')
        self.critic_next = dense_nn(self.s_next,
                                    self.layer_sizes + [1],
                                    name='critic',
                                    reuse=True)
        self.critic_vars = self.scope_vars('critic')

        # TD target
        self.td_target = self.r + self.gamma * tf.squeeze(
            self.critic_next) * (1.0 - self.done)
        self.td_error = self.td_target - tf.squeeze(self.critic)
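The TD error above is typically what drives both updates: the critic regresses its value estimate toward the TD target, and the actor update is weighted by the TD error as an advantage estimate. Below is a minimal sketch of how such losses could be wired on top of this graph; the learning-rate placeholder and the Adam optimizers are assumptions, not part of the snippet above.

        # Hypothetical continuation (not from the original snippet): losses built
        # on the TD target / TD error defined above.
        self.lr = tf.placeholder(tf.float32, shape=None, name='learning_rate')

        with tf.variable_scope('critic_train'):
            # Critic regresses V(s) toward the (gradient-stopped) TD target.
            self.loss_c = tf.reduce_mean(
                tf.square(tf.stop_gradient(self.td_target) - tf.squeeze(self.critic)))
            self.train_c = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss_c, var_list=self.critic_vars)

        with tf.variable_scope('actor_train'):
            # Policy gradient: cross-entropy of the taken action, weighted by the TD error.
            xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.actor, labels=self.a)
            self.loss_a = tf.reduce_mean(tf.stop_gradient(self.td_error) * xent)
            self.train_a = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss_a, var_list=self.actor_vars)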
Example #2
    def _build_networks(self):
        """For continuous action space.
        """
        # Define input placeholders
        self.s = tf.placeholder(tf.float32,
                                shape=[None] + self.state_dim,
                                name='state')
        self.a = tf.placeholder(tf.float32,
                                shape=[None] + self.act_dim,
                                name='action')
        self.s_next = tf.placeholder(tf.float32,
                                     shape=[None] + self.state_dim,
                                     name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None, ), name='reward')

        with tf.variable_scope('primary'):
            # Actor: deterministic policy mu(s) outputs one action vector.
            self.mu = dense_nn(self.s,
                               self.actor_layers + self.act_dim,
                               output_fn=tf.nn.tanh,
                               name='mu')
            # Critic: action value, Q(s, a)
            self.Q = dense_nn(tf.concat([self.s, self.a], axis=1),
                              self.critic_layers + [1],
                              name='Q')
            # We want to train the mu network to maximize the Q value estimated by the
            # critic; this Q_mu tensor is only used for training the actor.
            self.Q_mu = dense_nn(tf.concat([self.s, self.mu], axis=1),
                                 self.critic_layers + [1],
                                 name='Q',
                                 reuse=True)

        with tf.variable_scope('target'):
            # Clone target networks.
            self.mu_target = dense_nn(self.s_next,
                                      self.actor_layers + self.act_dim,
                                      output_fn=tf.nn.tanh,
                                      name='mu')
            self.Q_target = dense_nn(tf.concat([self.s_next, self.mu_target],
                                               axis=1),
                                     self.critic_layers + [1],
                                     name='Q')

        self.Q_vars = self.scope_vars('primary/Q')
        self.mu_vars = self.scope_vars('primary/mu')

        # sanity check
        self.primary_vars = self.Q_vars + self.mu_vars
        self.target_vars = self.scope_vars('target/Q') + self.scope_vars(
            'target/mu')
        assert len(self.primary_vars) == len(self.target_vars)
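With the primary/target variable lists collected, DDPG-style training usually keeps the target networks in sync via Polyak averaging. A minimal sketch of such update ops, assuming `scope_vars` returns variables in matching order and that `tau` is a small hyperparameter not defined in the snippet:

        # Hypothetical continuation: soft target-network updates (Polyak averaging).
        # `tau` is an assumed hyperparameter, e.g. 0.01; it is not defined above.
        tau = 0.01
        self.update_target_ops = [
            v_t.assign(tau * v + (1.0 - tau) * v_t)
            for v, v_t in zip(self.primary_vars, self.target_vars)
        ]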
Example #3
    def build(self):
        self.lr = tf.placeholder(tf.float32, shape=None, name='learning_rate')

        # Inputs
        self.s = tf.placeholder(tf.float32, shape=[None] + self.state_dim, name='state')
        self.a = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.returns = tf.placeholder(tf.float32, shape=(None,), name='return')

        # Build network
        self.pi = dense_nn(self.s, self.layer_sizes + [self.act_size], name='pi_network')
        self.sampled_actions = tf.squeeze(tf.multinomial(self.pi, 1))
        self.pi_vars = self.scope_vars('pi_network')

        if self.baseline:
            # State-value estimate as the baseline; squeeze to shape (None,) so it
            # broadcasts correctly against the (None,) returns.
            self.v = tf.squeeze(dense_nn(self.s, self.layer_sizes + [1], name='v_network'))
            self.target = self.returns - self.v  # advantage

            with tf.variable_scope('v_optimize'):
                self.loss_v = tf.reduce_mean(tf.squared_difference(self.v, self.returns))
                self.optim_v = tf.train.AdamOptimizer(self.lr).minimize(self.loss_v, name='adam_optim_v')
        else:
            self.target = tf.identity(self.returns)

        with tf.variable_scope('pi_optimize'):
            self.loss_pi = tf.reduce_mean(
                tf.stop_gradient(self.target) * tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.pi, labels=self.a), name='loss_pi')
            # self.optim_pi = tf.train.AdamOptimizer(self.lr)
            # self.grads_pi = self.optim_pi.compute_gradients(self.loss_pi, self.pi_vars)
            # self.train_pi_op = self.optim_pi.apply_gradients(self.grads_pi)
            self.optim_pi = tf.train.AdamOptimizer(self.lr).minimize(self.loss_pi, name='adam_optim_pi')

        with tf.variable_scope('summary'):
            self.loss_pi_summ = tf.summary.scalar('loss_pi', self.loss_pi)

            self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
            self.ep_reward_summ = tf.summary.scalar('episode_reward', self.ep_reward)
            summ_list = [self.loss_pi_summ, self.ep_reward_summ]

            if self.baseline:
                self.loss_v_summ = tf.summary.scalar('loss_v', self.loss_v)
                summ_list.append(self.loss_v_summ)

            self.merged_summary = tf.summary.merge(summ_list)

        if self.baseline:
            self.train_ops = [self.optim_pi, self.optim_v]
        else:
            self.train_ops = [self.optim_pi]

        self.sess.run(tf.global_variables_initializer())
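The `returns` placeholder is normally fed with discounted Monte-Carlo returns computed after an episode finishes. Below is a small, self-contained helper along those lines; the function name and the `gamma` default are illustrative, not part of the class above.

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # Discounted Monte-Carlo return G_t = r_t + gamma * G_{t+1}, computed backwards
    # over one episode; an array like this is what feeds the `returns` placeholder.
    out = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out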
Example #4
    def _build_networks(self):
        # Define input placeholders
        self.s = tf.placeholder(tf.float32,
                                shape=[None] + self.state_dim,
                                name='state')
        self.a = tf.placeholder(tf.int32, shape=(None, ), name='action')
        self.s_next = tf.placeholder(tf.float32,
                                     shape=[None] + self.state_dim,
                                     name='next_state')
        self.r = tf.placeholder(tf.float32, shape=(None, ), name='reward')
        self.done = tf.placeholder(tf.float32,
                                   shape=(None, ),
                                   name='done_flag')

        self.old_logp_a = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='old_logp_actor')
        self.v_target = tf.placeholder(tf.float32,
                                       shape=(None, ),
                                       name='v_target')
        self.adv = tf.placeholder(tf.float32, shape=(None, ), name='advantage')

        with tf.variable_scope('actor'):
            # Actor: outputs action logits; the softmax below gives the action probabilities.
            self.actor = dense_nn(self.s,
                                  self.actor_layers + [self.act_size],
                                  name='actor')
            self.actor_proba = tf.nn.softmax(self.actor)
            a_ohe = tf.one_hot(self.a,
                               self.act_size,
                               1.0,
                               0.0,
                               name='action_ohe')
            self.logp_a = tf.reduce_sum(tf.log(self.actor_proba) * a_ohe,
                                        axis=-1,
                                        name='new_logp_actor')
            self.actor_vars = self.scope_vars('actor')

        with tf.variable_scope('critic'):
            # Critic: state value, V(s)
            self.critic = tf.squeeze(
                dense_nn(self.s, self.critic_layers + [1], name='critic'))
            self.critic_next = tf.squeeze(
                dense_nn(self.s_next,
                         self.critic_layers + [1],
                         name='critic',
                         reuse=True))
            self.critic_vars = self.scope_vars('critic')
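The `old_logp_a` and `adv` placeholders suggest a PPO-style clipped surrogate objective on the probability ratio between the new and old policies. A minimal sketch of such losses built on the tensors above; `clip_range` is an assumed hyperparameter, and the critic loss simply regresses toward the externally supplied `v_target`.

        # Hypothetical continuation: PPO-style clipped surrogate loss.
        # `clip_range` is an assumed hyperparameter, e.g. 0.2; it is not defined above.
        clip_range = 0.2
        ratio = tf.exp(self.logp_a - self.old_logp_a)
        surr = ratio * self.adv
        surr_clipped = tf.clip_by_value(
            ratio, 1.0 - clip_range, 1.0 + clip_range) * self.adv
        self.loss_actor = -tf.reduce_mean(tf.minimum(surr, surr_clipped))
        # Critic regresses toward the externally computed value targets.
        self.loss_critic = tf.reduce_mean(tf.square(self.v_target - self.critic))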
Example #5
    def create_q_networks(self):
        # The first dimension should be batch_size * step_size (the flattened batch).
        self.states = tf.placeholder(tf.float32, shape=(None, *self.state_dim), name='state')
        self.states_next = tf.placeholder(tf.float32, shape=(None, *self.state_dim),
                                          name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.actions_next = tf.placeholder(tf.int32, shape=(None,), name='action_next')
        self.rewards = tf.placeholder(tf.float32, shape=(None,), name='reward')
        self.done_flags = tf.placeholder(tf.float32, shape=(None,), name='done')

        # The output is one Q value per action (not a probability distribution).

        net_class, net_params = self._extract_network_params()

        if self.dueling:
            self.q_hidden = net_class(self.states, self.layer_sizes[:-1], name='Q_primary',
                                      **net_params)
            self.adv = dense_nn(self.q_hidden, self.layer_sizes[-1:] + [self.act_size],
                                name='Q_primary_adv')
            self.v = dense_nn(self.q_hidden, self.layer_sizes[-1:] + [1], name='Q_primary_v')

            # Average Dueling
            self.q = self.v + (self.adv - tf.reduce_mean(
                self.adv, axis=1, keepdims=True))

            self.q_target_hidden = net_class(self.states_next, self.layer_sizes[:-1], name='Q_target',
                                             **net_params)
            self.adv_target = dense_nn(self.q_target_hidden, self.layer_sizes[-1:] + [self.act_size],
                                       name='Q_target_adv')
            self.v_target = dense_nn(self.q_target_hidden, self.layer_sizes[-1:] + [1],
                                     name='Q_target_v')

            # Average Dueling
            self.q_target = self.v_target + (self.adv_target - tf.reduce_mean(
                self.adv_target, axis=1, keepdims=True))

        else:
            self.q = net_class(self.states, self.layer_sizes + [self.act_size], name='Q_primary',
                               **net_params)
            self.q_target = net_class(self.states_next, self.layer_sizes + [self.act_size],
                                      name='Q_target', **net_params)

        # The primary and target Q networks should match.
        self.q_vars = self.scope_vars('Q_primary')
        self.q_target_vars = self.scope_vars('Q_target')
        assert len(self.q_vars) == len(self.q_target_vars), \
            "The primary and target Q-networks do not match."
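Because the primary and target variable lists are asserted to have matching lengths, a hard copy op is typically what keeps the target network in sync, and the TD(0) loss is built on the chosen actions. A minimal sketch of both; the sync op assumes `scope_vars` returns variables in matching order, and `self.gamma` is assumed to be a discount-factor attribute set elsewhere.

        # Hypothetical continuation: hard sync from the primary to the target network.
        self.update_target_op = tf.group(*[
            v_t.assign(v) for v, v_t in zip(self.q_vars, self.q_target_vars)
        ])

        # Q(s, a) for the actions actually taken in the batch.
        action_ohe = tf.one_hot(self.actions, self.act_size, 1.0, 0.0)
        q_sa = tf.reduce_sum(self.q * action_ohe, axis=1)
        # One-step TD target computed from the target network.
        y = self.rewards + self.gamma * tf.reduce_max(self.q_target, axis=1) * (
            1.0 - self.done_flags)
        self.loss = tf.reduce_mean(tf.square(tf.stop_gradient(y) - q_sa))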