Example 1
def calc_pi_loss(logic_outs, actions, advantages):
    """Calculate policy gradient loss."""
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logic_outs)
    advantages = tf.stop_gradient(advantages)
    pg_loss_per_step = cross_entropy * advantages
    return tf.reduce_sum(pg_loss_per_step)
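
A minimal usage sketch for calc_pi_loss, not part of the original source: the tensor values below are made up, and TF2 eager mode is assumed (the ops used above exist under both the TF1 and TF2 APIs).

# Hypothetical usage sketch (illustrative values only).
import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 1.5, 0.3]])          # [batch, num_actions]
actions = tf.constant([0, 1], dtype=tf.int32)    # chosen action index per sample
advantages = tf.constant([1.2, -0.4])            # advantage estimates

loss = calc_pi_loss(logits, actions, advantages)
print(float(loss))  # scalar policy-gradient loss; no gradient flows into `advantages`
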
Example 2
    def build_train_graph(self):
        """
        Build train graph.

        Because of the different seq_max(1 vs limit),
        train graph cannot connect-up to actor.graph directly.
        Hence, we build an explore sub-graph and train sub-graph,
        which sync with tf.assign between two collections.
        :return:
        """
        with self.graph.as_default():
            with tf.variable_scope("eval_agent"):
                trajectory_agent_outs, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    seq_max=self.fix_seq_length + 1,  # important
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,  # full trajectory, no need to keep hidden state
                )

            with tf.variable_scope("target_agent"):
                tar_agent_outs_tmp, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    # fixed value, differs between explore and train
                    seq_max=self.fix_seq_length + 1,
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,
                )
                target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

            _eval_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
            _target_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

            with tf.variable_scope("soft_replacement"):
                self.agent_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_agent_paras,
                                                    _eval_agent_paras)]

                self.agent_explore_replace_op = [
                    tf.assign(t, e) for t, e in zip(self._explore_paras,
                                                    _eval_agent_paras)
                ]

            self._print_trainable_var_name(
                _eval_agent_paras=_eval_agent_paras,
                _target_agent_paras=_target_agent_paras,
                _explore_paras=self._explore_paras,
            )

            # reshape agent outputs into per-agent Q-values
            # Calculate estimated Q-values ----------------
            mac_out = tf.reshape(
                trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            logging.debug("mac_out: {}".format(mac_out))
            chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                     self.ph_actions)

            # Calculate the Q-Values necessary for the target -----------
            target_mac_out = tf.reshape(
                target_trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            target_mac_out = target_mac_out[:, 1:]

            # Mask out unavailable actions
            # target_mac_out[avail_actions[:, 1:] == 0] = -9999999
            indices = tf.equal(self.ph_avail_action[:, 1:], 0)
            mask_val = tf.tile(
                [[[[-999999.0]]]],
                [
                    self.batch_size,
                    self.fix_seq_length,
                    self.n_agents,
                    self.avail_action_num,
                ],
            )
            logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
                indices, mask_val, target_mac_out))

            target_mac_out = tf.where(indices, mask_val, target_mac_out)

            if self.use_double_q:
                # Get actions that maximise live Q (for double q-learning)
                mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
                mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
                cur_max_actions = tf.expand_dims(
                    tf.argmax(mac_out_detach, axis=-1), -1)
                target_max_qvals = self.gather_custom(target_mac_out,
                                                      cur_max_actions)
            else:
                target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

            # eval mixer ---------------
            with tf.variable_scope("eval_mixer"):
                self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                                  self.ph_train_states)

            with tf.variable_scope("target_mixer"):
                q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                                 self.ph_train_target_states)
                self.target_q_tot = tf.stop_gradient(q_tot_tmp)

            _eval_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
            _target_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

            with tf.variable_scope("soft_replacement"):
                self.mix_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_mix_paras,
                                                    _eval_mix_paras)]

            self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                           _target_mix_paras=_target_mix_paras)

            # Calculate 1-step Q-Learning targets
            targets = (self.ph_rewards +
                       self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)

            # TD error
            td_error = self.q_tot - tf.stop_gradient(targets)

            # mask = mask.expand_as(td_error)  # FIXME: ph_mask is assumed to already match td_error's shape

            # 0-out the targets that came from padded data
            masked_td_error = tf.multiply(td_error, self.ph_mask)

            self.loss = tf.reduce_sum(masked_td_error**2) / tf.reduce_sum(self.ph_mask)

            # Optimise
            optimizer = tf.train.RMSPropOptimizer(
                self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            capped_gvs = [(
                grad if grad is None else tf.clip_by_norm(
                    grad, clip_norm=self.grad_norm_clip),
                var,
            ) for grad, var in grads_and_vars]
            self.grad_update = optimizer.apply_gradients(capped_gvs)
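
A hypothetical training-loop fragment, not from the original source, showing how the ops built above might be run. The names `learner` (the object that built this graph), its `sess` session, and `batch_feed_dict` are assumptions introduced only for illustration.

# Sync the target and explore networks from the eval networks.
learner.sess.run(learner.agent_train_replace_op)    # eval_agent -> target_agent
learner.sess.run(learner.mix_train_replace_op)      # eval_mixer -> target_mixer
learner.sess.run(learner.agent_explore_replace_op)  # eval_agent -> explore net

# One training step: batch_feed_dict maps the ph_* placeholders to a sampled batch.
_, loss_val = learner.sess.run([learner.grad_update, learner.loss],
                               feed_dict=batch_feed_dict)
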
Example 3
def scale_gradient(tensor, scale):
    """Scales the gradient for the backward pass."""
    return tensor * scale + tf.stop_gradient(tensor) * (1 - scale)
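
Because tf.stop_gradient passes the value through unchanged but blocks the backward pass, the forward output equals tensor while the gradient is multiplied by scale. A small eager-mode check, hypothetical and not from the source:

import tensorflow as tf

x = tf.constant(3.0)
with tf.GradientTape() as tape:
    tape.watch(x)                      # x is a constant, so watch it explicitly
    y = scale_gradient(x, 0.5)
print(float(y))                        # 3.0 -> forward value unchanged
print(float(tape.gradient(y, x)))      # 0.5 -> gradient scaled by `scale`
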
Example 4
def from_logic_outputs(behaviour_policy_logic_outputs,
                       target_policy_logic_outputs,
                       actions,
                       discounts,
                       rewards,
                       values,
                       bootstrap_value,
                       clip_importance_sampling_threshold=1.0,
                       clip_pg_importance_sampling_threshold=1.0):
    """
    Calculate vtrace with logic outputs.

    :param behaviour_policy_logic_outputs: behaviour_policy_logic_outputs
    :param target_policy_logic_outputs: target_policy_logic_outputs
    :param actions:
    :param discounts:
    :param rewards:
    :param values:
    :param bootstrap_value:
    :param clip_importance_sampling_threshold:
    :param clip_pg_importance_sampling_threshold:
    :return:
    """
    behaviour_policy_logic_outputs = tf.convert_to_tensor(
        behaviour_policy_logic_outputs, dtype=tf.float32)
    target_policy_logic_outputs = tf.convert_to_tensor(
        target_policy_logic_outputs, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)

    # support [T, B, Action_dimension]
    behaviour_policy_logic_outputs.shape.assert_has_rank(3)
    target_policy_logic_outputs.shape.assert_has_rank(3)
    actions.shape.assert_has_rank(2)

    target_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=target_policy_logic_outputs, labels=actions)

    behaviour_log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=behaviour_policy_logic_outputs, labels=actions)

    # importance sampling weight (ratio of target to behaviour action probabilities)
    importance_sampling_weights = tf.exp(target_log_prob - behaviour_log_prob)

    clipped_importance_sampling_weight = tf.minimum(
        clip_importance_sampling_threshold, importance_sampling_weights)
    clipped_pg_importance_sampling_weight = tf.minimum(
        clip_pg_importance_sampling_threshold, importance_sampling_weights)

    # trace-cutting coefficient, clipped at 1.0
    coefficient = tf.minimum(1.0, importance_sampling_weights)

    next_values = tf.concat(
        [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)

    # temporal-difference terms used in the V-trace fixed-point correction
    deltas = clipped_importance_sampling_weight * (
        rewards + discounts * next_values - values)
    sequences = (deltas, discounts, coefficient)

    # calculate the V-trace corrections with tf.scan, using reverse=True to scan from the last step back to the first
    def scan_fn(cumulative_value, sequence_item):
        _delta, _discount, _coefficient = sequence_item
        return _delta + _discount * _coefficient * cumulative_value

    last_values = tf.zeros_like(bootstrap_value)
    temporal_difference = tf.scan(
        fn=scan_fn,
        elems=sequences,
        initializer=last_values,
        parallel_iterations=1,
        back_prop=False,
        reverse=True,
    )

    value_of_states = tf.add(temporal_difference, values)
    # Advantage for policy gradient.
    value_of_next_state = tf.concat(
        [value_of_states[1:],
         tf.expand_dims(bootstrap_value, 0)], axis=0)
    pg_advantages = clipped_pg_importance_sampling_weight * (
        rewards + discounts * value_of_next_state - values)

    value_of_states = tf.stop_gradient(value_of_states)
    pg_advantages = tf.stop_gradient(pg_advantages)
    return value_of_states, pg_advantages
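
A hypothetical eager-mode call, not from the original source: the shapes follow the [T, B, ACTION_DIM] convention noted above, and the values are random and purely illustrative.

import numpy as np
import tensorflow as tf

T, B, A = 5, 2, 4                                     # unroll length, batch size, actions
behaviour_logits = np.random.randn(T, B, A).astype(np.float32)
target_logits = np.random.randn(T, B, A).astype(np.float32)
actions = np.random.randint(0, A, size=(T, B)).astype(np.int32)
discounts = 0.99 * np.ones((T, B), dtype=np.float32)  # zero at terminal steps in practice
rewards = np.random.randn(T, B).astype(np.float32)
values = np.random.randn(T, B).astype(np.float32)
bootstrap_value = np.random.randn(B).astype(np.float32)

value_targets, pg_advantages = from_logic_outputs(
    behaviour_logits, target_logits, actions,
    discounts, rewards, values, bootstrap_value)
print(value_targets.shape, pg_advantages.shape)       # both (T, B)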