Example #1
 def entropy(self):
     rescaled_logits = self.logits - tf.reduce_max(
         self.logits, axis=-1, keepdims=True)
     exp_logits = tf.exp(rescaled_logits)
     z = tf.reduce_sum(exp_logits, axis=-1, keepdims=True)
     p = exp_logits / z
     return tf.reduce_sum(p * (tf.log(z) - rescaled_logits),
                          axis=-1,
                          keepdims=True)
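This computes the categorical entropy directly from the logits: with rescaled logits x_i = logit_i - max_j logit_j and Z = sum_i exp(x_i), the probabilities are p_i = exp(x_i) / Z, and since log p_i = x_i - log Z, the entropy is H = -sum_i p_i log p_i = sum_i p_i (log Z - x_i), which is exactly the returned expression. A minimal NumPy sanity check of that identity (the array values are illustrative only, not from the snippet):

import numpy as np

logits = np.array([2.0, 0.5, -1.0])
x = logits - logits.max()            # rescaled logits
z = np.exp(x).sum()                  # partition function
p = np.exp(x) / z                    # softmax probabilities
assert np.isclose(-(p * np.log(p)).sum(),        # -sum p log p
                  (p * (np.log(z) - x)).sum())   # form used above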
Example #2
def calc_pi_loss(logic_outs, actions, advantages):
    """Calculate policy gradient loss."""
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logic_outs)
    advantages = tf.stop_gradient(advantages)
    pg_loss_per_step = cross_entropy * advantages
    return tf.reduce_sum(pg_loss_per_step)
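Here tf.nn.sparse_softmax_cross_entropy_with_logits returns -log pi(a_t | s_t) for each taken action, so the result is sum_t advantage_t * (-log pi(a_t | s_t)), the standard policy-gradient surrogate loss; tf.stop_gradient ensures the advantages are treated as constants, so gradients flow only through the log-probabilities.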
Example #3
 def kl(self, other):
     assert isinstance(other,
                       CategoricalDist), 'Distribution types do not match.'
     rescaled_logits_self = self.logits - tf.reduce_max(
         self.logits, axis=-1, keepdims=True)
     rescaled_logits_other = other.logits - tf.reduce_max(
         other.logits, axis=-1, keepdims=True)
     exp_logits_self = tf.exp(rescaled_logits_self)
     exp_logits_other = tf.exp(rescaled_logits_other)
     z_self = tf.reduce_sum(exp_logits_self, axis=-1, keepdims=True)
     z_other = tf.reduce_sum(exp_logits_other, axis=-1, keepdims=True)
     p = exp_logits_self / z_self
     return tf.reduce_sum(p * (rescaled_logits_self - tf.log(z_self) -
                               rescaled_logits_other + tf.log(z_other)),
                          axis=-1,
                          keepdims=True)
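With the same rescaling trick as in the entropy of Example #1, log p_i = x_i^self - log Z_self and log q_i = x_i^other - log Z_other, so the returned value is KL(p || q) = sum_i p_i (log p_i - log q_i), evaluated in a numerically stable way without ever forming the other distribution's probabilities.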
Example #4
 def kl(self, other):
     assert isinstance(other,
                       DiagGaussianDist), 'Distribution types do not match.'
     return tf.reduce_sum(
         (tf.square(self.std) + tf.square(self.mean - other.mean)) /
         (2.0 * tf.square(other.std)) + other.log_std - self.log_std - 0.5,
         axis=-1,
         keepdims=True)
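This is the closed-form KL divergence between two diagonal Gaussians, summed over the last axis: KL(N(mu_1, sigma_1^2) || N(mu_2, sigma_2^2)) = log(sigma_2 / sigma_1) + (sigma_1^2 + (mu_1 - mu_2)^2) / (2 * sigma_2^2) - 1/2, with the log-ratio written as other.log_std - self.log_std.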
Example #5
 def gather_custom(self, inputs, indices):
     indices = tf.cast(indices, tf.uint8)
     one_hot = tf.squeeze(
         tf.one_hot(indices=indices, depth=self.n_actions, on_value=1.,
                    off_value=0., axis=-1, dtype=tf.float32),
         axis=-2)
     mul_test = tf.multiply(inputs, one_hot)
     # reduce_sum_val = tf.reduce_sum(mul_test, axis=-1, keep_dims=True)
     reduce_sum_val = tf.reduce_sum(mul_test, axis=-1)
     return reduce_sum_val
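gather_custom is a one-hot "take along the last axis": multiplying inputs by a one-hot encoding of indices and summing selects inputs[..., index] for every leading position (Example #6 uses it to pick the Q-value of the chosen action per agent). A minimal NumPy equivalent for the 2-D case (names and values are illustrative only, not from the snippet):

import numpy as np

q_values = np.array([[1.0, 2.0, 3.0],
                     [4.0, 5.0, 6.0]])   # [batch, n_actions]
actions = np.array([[2], [0]])           # chosen action per row
selected = np.take_along_axis(q_values, actions, axis=-1).squeeze(-1)
# selected == [3., 4.], matching the one-hot multiply-and-sum above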
Example #6
    def build_train_graph(self):
        """
        Build train graph.

        Because seq_max differs (1 during exploration vs. the full limit
        during training), the train graph cannot be connected to
        actor.graph directly. Hence we build separate explore and train
        sub-graphs, which are kept in sync with tf.assign between the
        two variable collections.
        :return:
        """
        with self.graph.as_default():
            with tf.variable_scope("eval_agent"):
                trajectory_agent_outs, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    seq_max=self.fix_seq_length + 1,  # important: full-length sequence for training
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,  # full trajectory, no need to carry hidden state
                )

            with tf.variable_scope("target_agent"):
                tar_agent_outs_tmp, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    # fixed value, differs between explore and train
                    seq_max=self.fix_seq_length + 1,
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,
                )
                target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

            _eval_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
            _target_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

            with tf.variable_scope("soft_replacement"):
                self.agent_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_agent_paras,
                                                    _eval_agent_paras)]

                self.agent_explore_replace_op = [
                    tf.assign(t, e) for t, e in zip(self._explore_paras,
                                                    _eval_agent_paras)
                ]

            self._print_trainable_var_name(
                _eval_agent_paras=_eval_agent_paras,
                _target_agent_paras=_target_agent_paras,
                _explore_paras=self._explore_paras,
            )

            # agent out to max q values
            # Calculate estimated Q-Values ----------------
            mac_out = tf.reshape(
                trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            logging.debug("mac_out: {}".format(mac_out))
            chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                     self.ph_actions)

            # Calculate the Q-Values necessary for the target -----------
            target_mac_out = tf.reshape(
                target_trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            target_mac_out = target_mac_out[:, 1:]

            # Mask out unavailable actions
            # target_mac_out[avail_actions[:, 1:] == 0] = -9999999
            indices = tf.equal(self.ph_avail_action[:, 1:], 0)
            mask_val = tf.tile(
                [[[[-999999.0]]]],
                [
                    self.batch_size,
                    self.fix_seq_length,
                    self.n_agents,
                    self.avail_action_num,
                ],
            )
            logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
                indices, mask_val, target_mac_out))

            target_mac_out = tf.where(indices, mask_val, target_mac_out)

            if self.use_double_q:
                # Get actions that maximise live Q (for double q-learning)
                mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
                mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
                cur_max_actions = tf.expand_dims(
                    tf.argmax(mac_out_detach, axis=-1), -1)
                target_max_qvals = self.gather_custom(target_mac_out,
                                                      cur_max_actions)
            else:
                target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

            # eval mixer ---------------
            with tf.variable_scope("eval_mixer"):
                self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                                  self.ph_train_states)

            with tf.variable_scope("target_mixer"):
                q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                                 self.ph_train_target_states)
                self.target_q_tot = tf.stop_gradient(q_tot_tmp)

            _eval_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
            _target_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

            with tf.variable_scope("soft_replacement"):
                self.mix_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_mix_paras,
                                                    _eval_mix_paras)]

            self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                           _target_mix_paras=_target_mix_paras)

            # Calculate 1-step Q-Learning targets
            targets = (self.ph_rewards +
                       self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)

            # Td-error
            td_error = self.q_tot - tf.stop_gradient(targets)

            # mask = mask.expand_as(td_error)  # FIXME: assumes mask already matches td_error's shape

            # 0-out the targets that came from padded data
            masked_td_error = tf.multiply(td_error, self.ph_mask)

            self.loss = tf.reduce_sum(masked_td_error**2) / tf.reduce_sum(self.ph_mask)

            # Optimise
            optimizer = tf.train.RMSPropOptimizer(
                self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            capped_gvs = [(
                grad if grad is None else tf.clip_by_norm(
                    grad, clip_norm=self.grad_norm_clip),
                var,
            ) for grad, var in grads_and_vars]
            self.grad_update = optimizer.apply_gradients(capped_gvs)
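The tf.assign lists created under soft_replacement are hard parameter copies despite the scope name: agent_train_replace_op and mix_train_replace_op overwrite the target networks with the current eval parameters, while agent_explore_replace_op pushes them to the explore (actor) variables. A minimal sketch of how these ops might be driven, assuming a tf.Session named sess, a step counter, and a target_update_interval, none of which appear in the snippet:

# hypothetical training-loop excerpt
if step % target_update_interval == 0:
    sess.run([self.agent_train_replace_op, self.mix_train_replace_op])  # refresh target nets
sess.run(self.agent_explore_replace_op)  # sync the weights used by the explorer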
Example #7
def calc_entropy_loss(logic_outs):
    """Calculate entropy loss."""
    pi = tf.nn.softmax(logic_outs)
    log_pi = tf.nn.log_softmax(logic_outs)
    entropy_per_step = tf.reduce_sum(-pi * log_pi, axis=-1)
    return -tf.reduce_sum(entropy_per_step)
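entropy_per_step is the per-state policy entropy H(pi) = -sum_a pi(a) log pi(a), computed with the numerically stable log_softmax. The function returns the negated sum, so adding this term to the total loss (usually scaled by a small coefficient) rewards higher entropy and acts as an exploration bonus.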
Example #8
def calc_baseline_loss(advantages):
    """Calculate the baseline loss."""
    return 0.5 * tf.reduce_sum(tf.square(advantages))
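This is the usual value-function (baseline) regression loss, 0.5 * sum_t delta_t^2; in this context the advantages argument presumably holds the value-target minus value-prediction differences rather than the policy advantages of Example #2.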
Example #9
 def entropy(self):
     return tf.reduce_sum(self.log_std + 0.5 * (np.log(2.0 * np.pi) + 1.0),
                          axis=-1,
                          keepdims=True)
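This matches the closed-form entropy of a diagonal Gaussian, H = sum_i [ log sigma_i + 0.5 * log(2 * pi * e) ], since 0.5 * (log(2 * pi) + 1) = 0.5 * log(2 * pi * e).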
Example #10
 def neglog_prob(self, x):
     return 0.5 * np.log(2.0 * np.pi) * tf.cast((tf.shape(x)[-1]), tf.float32) + \
         0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1, keepdims=True) + \
         tf.reduce_sum(self.log_std, axis=-1, keepdims=True)
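Term by term this is the negative log-density of a diagonal Gaussian: -log N(x; mu, diag(sigma^2)) = (d / 2) * log(2 * pi) + 0.5 * sum_i ((x_i - mu_i) / sigma_i)^2 + sum_i log sigma_i, where d = tf.shape(x)[-1] is the event dimension.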