def extra_loss(self, policy_loss, loss_inputs, stats):
     obs = restore_original_dimensions(loss_inputs["obs"], self.obs_space,
                                       self.framework)["board"]
     previous_round = restore_original_dimensions(
         loss_inputs["new_obs"], self.obs_space,
         self.framework)["previous_round"]
     previous_round = previous_round[:, :self.inception_steps]
     previous_round = tf.reshape(previous_round, [
         tf.shape(previous_round)[0],
         previous_round.shape[1] * previous_round.shape[2]
     ])  # reshape so all hands are in one vector
     previous_round = tf.math.divide_no_nan(
         previous_round, tf.expand_dims(tf.reduce_sum(previous_round, 1),
                                        1))
     obs_module_out, state_1 = self.obs_module({"obs": obs}, None, None)
     aux_module_out, state_2 = self.aux_module({"obs": obs_module_out},
                                               state_1, None)
     concat = tf.concat([
         tf.one_hot(tf.stop_gradient(loss_inputs["actions"]),
                    self.action_space.n), aux_module_out
     ],
                        axis=1)
     aux_head_out, _ = self.aux_head({"obs": concat}, state_2, None)
     cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
         labels=tf.stop_gradient(previous_round), logits=aux_head_out)
     policy_inference_loss = tf.reduce_mean(cross_entropy)
     combined_loss = self.aux_loss_formula(policy_loss,
                                           policy_inference_loss)
     stats.update({
         "combined_loss": combined_loss,
         "policy_inference_loss": policy_inference_loss
     })
     return combined_loss
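The call to self.aux_loss_formula above goes to a helper defined elsewhere on the model. As a rough sketch only (the name and weighting below are assumptions, not the original implementation), such a combiner could be a fixed-weight sum:

def weighted_aux_loss(policy_loss, aux_loss, aux_weight=0.5):
    # Hypothetical aux_loss_formula: scale the auxiliary inference loss and
    # add it to the policy loss so one optimizer minimizes both terms.
    return policy_loss + aux_weight * aux_loss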
Example 2
    def learn_on_batch(self, postprocessed_batch):
        train_batch = self._lazy_tensor_dict(postprocessed_batch)
        unflattened_obs = restore_original_dimensions(
            train_batch[SampleBatch.CUR_OBS], self.observation_space,
            self.framework)

        info = {}

        start = time.time()
        self.model.partial_fit(unflattened_obs,
                               train_batch[SampleBatch.REWARDS],
                               train_batch[SampleBatch.ACTIONS])

        infos = postprocessed_batch["infos"]
        if "regret" in infos[0]:
            regret = sum(
                row["infos"]["regret"] for row in postprocessed_batch.rows())
            self.regrets.append(regret)
            info["cumulative_regret"] = sum(self.regrets)
        else:
            if log_once("no_regrets"):
                logger.warning("The env did not report `regret` values in "
                               "its `info` return, ignoring.")
        info["update_latency"] = time.time() - start
        return {LEARNER_STATS_KEY: info}
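This learn_on_batch only accumulates regret if the environment reports it; a minimal sketch of a gym-style env that returns a per-step `regret` in its info dict (the env, arm means, and reward logic are all made up for illustration):

import gym
import numpy as np

class RegretReportingEnv(gym.Env):
    # Toy two-armed bandit that reports per-step regret in the info dict.
    def __init__(self):
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(0.0, 1.0, shape=(1,))
        self._means = np.array([0.3, 0.7])  # hypothetical arm means

    def reset(self):
        return np.zeros(1, dtype=np.float32)

    def step(self, action):
        reward = float(np.random.rand() < self._means[action])
        # Regret = best arm's expected reward minus the chosen arm's mean.
        info = {"regret": float(self._means.max() - self._means[action])}
        return np.zeros(1, dtype=np.float32), reward, True, info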
Example 3
def new_ppo_surrogate_loss(policy, model, dist_class, train_batch):
    if policy.num_adversaries > 1:
        kl_diff_loss = setup_kl_loss(policy, model, dist_class, train_batch)

    # zero out the loss elements where you weren't actually acting
    original_space = restore_original_dimensions(train_batch['obs'],
                                                 model.obs_space)
    is_active = original_space['is_active']

    # extract the ppo_surrogate_loss before the mean is taken
    ppo_custom_surrogate_loss(policy, model, dist_class, train_batch)
    pre_mean_loss = policy.loss_obj.pre_mean_loss

    def reduce_mean_valid(t):
        return tf.reduce_mean(tf.boolean_mask(t, policy.loss_obj.valid_mask))

    # This mask combines both the valid mask and a check for when we were actually active in the env
    combined_mask = tf.math.logical_and(
        policy.loss_obj.valid_mask, tf.cast(tf.squeeze(is_active, -1),
                                            tf.bool))
    standard_loss = tf.reduce_mean(
        tf.boolean_mask(pre_mean_loss, combined_mask))

    # Since we are happy to evaluate the kl diff over obs in which we weren't active, we only mask this
    # with respect to the valid mask, which tracks padding for RNNs
    if policy.num_adversaries > 1 and policy.config['kl_diff_weight'] > 0:
        policy.unscaled_kl_loss = kl_diff_loss
        clipped_mean_loss = reduce_mean_valid(
            tf.clip_by_value(kl_diff_loss, 0, policy.kl_diff_clip))
        policy.kl_var = tf.math.reduce_std(kl_diff_loss)
        return -policy.config[
            'kl_diff_weight'] * clipped_mean_loss + standard_loss
    else:
        return standard_loss
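The is_active tensor above is recovered from a Dict observation space by restore_original_dimensions; a sketch of how such a space might be declared (field names other than is_active and all shapes are assumptions):

import numpy as np
from gym.spaces import Box, Dict

observation_space = Dict({
    # The agent's actual observation (shape is hypothetical).
    "obs": Box(-np.inf, np.inf, shape=(32,)),
    # 1.0 when the agent really acted on this step, 0.0 otherwise; the loss
    # above squeezes this to a boolean and combines it with the RNN valid mask.
    "is_active": Box(0.0, 1.0, shape=(1,)),
})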
Example 4
def imitation_loss(policy, model, dist_class, train_batch):
    original_space = restore_original_dimensions(train_batch['obs'],
                                                 model.obs_space)
    expert_tensor = original_space['expert_action']
    logits, state = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)

    if state:
        max_seq_len = tf.reduce_max(train_batch["seq_lens"])
        mask = tf.sequence_mask(train_batch["seq_lens"], max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(train_batch[Postprocessing.ADVANTAGES],
                            dtype=tf.bool)

    if policy.config['model']['custom_options']["hard_negative_mining"]:
        masked_logp = tf.boolean_mask(action_dist.logp(expert_tensor), mask)
        top_loss, _ = tf.math.top_k(masked_logp,
                                    int(policy.config['sgd_minibatch_size'] /
                                        10))  # todo make this an actual 10%
        top_loss = tf.reduce_sum(top_loss)
        imitation_loss = -tf.reduce_mean(top_loss)

    else:
        # Since we are doing gradient descent, we flip the sign so that we are minimizing the negative log prob
        imitation_loss = -tf.reduce_mean(
            tf.boolean_mask(action_dist.logp(expert_tensor), mask))

    return imitation_loss
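The mask built from seq_lens above marks which RNN timesteps are real data rather than padding; a small eager TensorFlow illustration of what tf.sequence_mask produces (the lengths are arbitrary):

import tensorflow as tf

seq_lens = tf.constant([3, 1])      # two sequences padded to max length 4
mask = tf.sequence_mask(seq_lens, maxlen=4)
# [[ True,  True,  True, False],
#  [ True, False, False, False]]
flat_mask = tf.reshape(mask, [-1])  # flattened, as fed to tf.boolean_mask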
Example 5
def get_actions_from_target_net(train_batch, policy, target_q_model,
                                observation_space, action_space):
    restored = restore_original_dimensions(train_batch[SampleBatch.NEXT_OBS],
                                           observation_space,
                                           target_q_model.framework)
    previous_round_obs = {}
    previous_round_obs["board"] = tf.reshape(restored["previous_round"], [
        tf.shape(restored["previous_round"])[0] *
        restored["previous_round"].shape[1],
        restored["previous_round"].shape[2]
    ])
    previous_round_obs["legal_actions"] = tf.reshape(
        restored["previous_round_legal_actions"], [
            tf.shape(restored["previous_round_legal_actions"])[0] *
            restored["previous_round_legal_actions"].shape[1],
            restored["previous_round_legal_actions"].shape[2]
        ])
    target_q_model.forward(
        {
            "obs": previous_round_obs,
            "is_training": policy._get_is_training_placeholder()
        }, [], None)
    q_out = target_q_model.get_q_out()
    previous_round = tf.one_hot(tf.argmax(q_out["value"], 1),
                                policy.action_space.n)
    previous_round = tf.reshape(previous_round, [
        tf.shape(restored["previous_round"])[0],
        restored["previous_round"].shape[1], action_space.n
    ])
    return previous_round
Example 6
 def forward(self, input_dict, hidden_state):
     """Wraps _forward() to unpack flattened Dict and Tuple observations."""
     input_dict["obs"] = input_dict["obs"].float()  # TODO(ekl): avoid cast
     input_dict["obs"] = restore_original_dimensions(
         input_dict["obs"], self.obs_space, tensorlib=torch)
     outputs, features, vf, h = self._forward(input_dict, hidden_state)
     return outputs, features, vf, h
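As a rough illustration of what this wrapper relies on: restore_original_dimensions maps a flattened observation batch back onto the nested space. The spaces and shapes below are made up, the original_space attribute is attached by hand because that is what RLlib's Dict-flattening preprocessor normally does, and the import path can differ slightly between RLlib versions:

import torch
from gym.spaces import Box, Dict
from ray.rllib.models.modelv2 import restore_original_dimensions

nested_space = Dict({"board": Box(0.0, 1.0, shape=(4,)),
                     "mask": Box(0.0, 1.0, shape=(2,))})
flat_space = Box(0.0, 1.0, shape=(6,))
flat_space.original_space = nested_space  # normally set by the preprocessor

flat_batch = torch.zeros(8, 6)  # 8 flattened observations (4 + 2 = 6)
restored = restore_original_dimensions(flat_batch, flat_space, tensorlib=torch)
assert restored["board"].shape == (8, 4)
assert restored["mask"].shape == (8, 2)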
Example 7
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.options["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()

        # define a secondary loss by building a graph copy with weight sharing
        with tf.variable_scope(self.scope,
                               reuse=tf.AUTO_REUSE,
                               auxiliary_name_scope=False):
            logits, _ = self._build_layers_v2(
                {
                    "obs":
                    restore_original_dimensions(input_ops["obs"],
                                                self.obs_space)
                }, self.num_outputs, self.options)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        action_dist = Categorical(logits)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
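The custom_options read above come from the model section of the trainer config; a sketch of how they might be supplied (the model name and path are placeholders):

config = {
    "model": {
        "custom_model": "my_imitation_model",  # hypothetical registered name
        "custom_options": {
            # Offline JSON episodes consumed by JsonReader in custom_loss().
            "input_files": "/tmp/expert-demos",
        },
    },
}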
Example 8
 def forward(self, input_dict, hidden_state):
     """Wraps _forward() to unpack flattened Dict and Tuple observations."""
     input_dict["obs"] = input_dict["obs"].float()  # TODO(ekl): avoid cast
     input_dict["obs"] = restore_original_dimensions(input_dict["obs"],
                                                     self.obs_space,
                                                     tensorlib=torch)
     outputs, features, vf, h = self._forward(input_dict, hidden_state)
     return outputs, features, vf, h
Example 9
 def extra_loss(self, policy_loss, loss_inputs, stats):
     obs = restore_original_dimensions(loss_inputs["obs"], self.obs_space,
                                       self.framework)["board"]
     hidden_hand = restore_original_dimensions(loss_inputs["obs"], self.obs_space,
                                               self.framework)["hidden_hand"]
     hidden_hand = tf.reshape(hidden_hand, [
         tf.shape(hidden_hand)[0] * hidden_hand.shape[1], hidden_hand.shape[2]
     ])  # reshape so all hands are in one batch
     obs_module_out, state_1 = self.obs_module({"obs": obs}, None, None)
     aux_module_out, state_2 = self.aux_module({"obs": obs_module_out}, state_1, None)
     aux_head_out, _ = self.aux_head({"obs": aux_module_out}, state_2, None)
     aux_head_out = tf.reshape(aux_head_out, tf.shape(hidden_hand))
     cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
         labels=tf.stop_gradient(hidden_hand),
         logits=aux_head_out)
     hand_inference_loss = tf.reduce_mean(cross_entropy)
     combined_loss = self.aux_loss_formula(policy_loss, hand_inference_loss)
     stats.update({
         "combined_loss": combined_loss,
         "hand_inference_loss": hand_inference_loss
     })
     return combined_loss
Example 10
 def _unpack_observations(self, input_dict):
     restored = input_dict.copy()
     restored["obs"] = restore_original_dimensions(
         input_dict["obs"], self.observation_space, self.framework
     )
     if len(input_dict["obs"].shape) > 2:
         restored["obs_flat"] = flatten(input_dict["obs"], self.framework)
     else:
         restored["obs_flat"] = input_dict["obs"]
     return restored
Example 11
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.model_config["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops(
            self.model_config["custom_options"].get("expert_size", 1))

        # define a secondary loss by building a graph copy with weight sharing
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        self.policy_loss = policy_loss
        (action_scores, model_logits,
         dist) = self.get_q_value_distributions(logits)
        model_logits = tf.squeeze(model_logits)
        action_dist = Categorical(model_logits, self.model_config)

        expert_logits = tf.cast(input_ops["actions"], tf.int32)
        expert_action = tf.math.argmax(expert_logits)
        expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
        model_action = action_dist.deterministic_sample()
        model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
        model_expert = model_action_one_hot * expert_action_one_hot
        imitation_loss = 0
        loss_type = self.model_config["custom_options"].get("loss", "ce")
        if loss_type == "ce":
            imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
        elif loss_type == "kl":
            expert_dist = Categorical(
                tf.one_hot(expert_logits, self.num_outputs), self.model_config)
            imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
        elif loss_type == "dqfd":
            max_value = float("-inf")
            Q_select = model_logits  #  TODO: difference in action_scores,dist and logits
            for a in range(self.num_outputs):
                max_value = tf.maximum(
                    Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                    max_value)
            imitation_loss = tf.reduce_mean(
                1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))

        self.imitation_loss = imitation_loss
        total_loss = (
            self.model_config["custom_options"]["lambda1"] * policy_loss +
            self.model_config["custom_options"]["lambda2"] * self.imitation_loss)
        return total_loss
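For reference, the custom_options keys this loss reads, gathered into one sketch (only the key names come from the code above; the values are placeholders):

custom_options = {
    "input_files": "/tmp/expert-demos",  # offline data for JsonReader
    "expert_size": 256,                  # batch size for tf_input_ops
    "loss": "ce",                        # one of "ce", "kl", "dqfd"
    "lambda1": 1.0,                      # weight on the RL policy loss
    "lambda2": 0.1,                      # weight on the imitation loss
}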
Example 12
    def __call__(self, input_dict, state=None, seq_lens=None):
        """Call the model with the given input tensors and state.

        This is the method used by RLlib to execute the forward pass. It calls
        forward() internally after unpacking nested observation tensors.

        Custom models should override forward() instead of __call__.

        Arguments:
            input_dict (dict): dictionary of input tensors, including "obs",
                "prev_action", "prev_reward", "is_training"
            state (list): list of state tensors with sizes matching those
                returned by get_initial_state + the batch dimension
            seq_lens (Tensor): 1d tensor holding input sequence lengths

        Returns:
            (outputs, state): The model output tensor of size
                [BATCH, output_spec.size] or a list of tensors corresponding to
                output_spec.shape_list, and a list of state tensors of
                [BATCH, state_size_i].
        """

        restored = input_dict.copy()
        restored["obs"] = restore_original_dimensions(input_dict["obs"],
                                                      self.obs_space,
                                                      self.framework)
        if len(input_dict["obs"].shape) > 2:
            restored["obs_flat"] = flatten(input_dict["obs"], self.framework)
        else:
            restored["obs_flat"] = input_dict["obs"]
        with self.context():
            res = self.forward(restored, state or [], seq_lens)
        if ((not isinstance(res, list) and not isinstance(res, tuple))
                or len(res) != 2):
            raise ValueError(
                "forward() must return a tuple of (output, state) tensors, "
                "got {}".format(res))
        outputs, state = res

        try:
            shape = outputs.shape
        except AttributeError:
            raise ValueError("Output is not a tensor: {}".format(outputs))
        else:
            if len(shape) != 2 or shape[1] != self.num_outputs:
                raise ValueError(
                    "Expected output shape of [None, {}], got {}".format(
                        self.num_outputs, shape))
        if not isinstance(state, list):
            raise ValueError("State output is not a list: {}".format(state))

        self._last_output = outputs
        return outputs, state
Example 13
    def compute_priors_and_value(self, obs):
        obs = convert_to_tensor([self.preprocessor.transform(obs)])
        input_dict = restore_original_dimensions(obs, self.obs_space, "torch")

        with torch.no_grad():
            model_out = self.forward(input_dict, None, [1])
            logits, _ = model_out
            value = self.value_function()
            logits, value = torch.squeeze(logits), torch.squeeze(value)
            priors = nn.Softmax(dim=-1)(logits)

            priors = priors.cpu().numpy()
            value = value.cpu().numpy()

            return priors, value
Example 14
def alpha_zero_loss(policy, model, dist_class, train_batch):
    # get the unflattened inputs
    input_dict = restore_original_dimensions(train_batch["obs"],
                                             policy.observation_space, "torch")
    # forward pass in model
    model_out = model.forward(input_dict, None, [1])
    logits, _ = model_out
    values = model.value_function()
    logits, values = torch.squeeze(logits), torch.squeeze(values)
    priors = nn.Softmax(dim=-1)(logits)
    # compute actor and critic losses
    policy_loss = torch.mean(
        -torch.sum(train_batch["mcts_policies"] * torch.log(priors), dim=-1))
    value_loss = torch.mean(torch.pow(values - train_batch["value_label"], 2))
    # compute total loss
    total_loss = (policy_loss + value_loss) / 2
    return total_loss, policy_loss, value_loss
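Restated as formulas (a paraphrase of the computation above, with $\pi_\theta$ the softmax priors, $v_\theta$ the value head, $\pi_{\mathrm{MCTS}}$ the search policy targets and $z$ the value labels):

$$\mathcal{L}_{\mathrm{policy}} = -\,\mathbb{E}\Big[\sum_a \pi_{\mathrm{MCTS}}(a\mid s)\,\log \pi_\theta(a\mid s)\Big],\qquad \mathcal{L}_{\mathrm{value}} = \mathbb{E}\big[(v_\theta(s)-z)^2\big],\qquad \mathcal{L} = \tfrac{1}{2}\big(\mathcal{L}_{\mathrm{policy}}+\mathcal{L}_{\mathrm{value}}\big)$$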
Example 15
    def custom_loss(self, policy_loss, loss_inputs):
        """Calculates a custom loss on top of the given policy_loss(es).

        Args:
            policy_loss (List[TensorType]): The list of already calculated
                policy losses (as many as there are optimizers).
            loss_inputs (TensorStruct): Struct of np.ndarrays holding the
                entire train batch.

        Returns:
            List[TensorType]: The altered list of policy losses. In case the
                custom loss should have its own optimizer, make sure the
                returned list is one larger than the incoming policy_loss list.
                In case you simply want to mix in the custom loss into the
                already calculated policy losses, return a list of altered
                policy losses (as done in this example below).
        """
        # Get the next batch from our input files.
        batch = self.reader.next()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            torch.from_numpy(batch["obs"]).float(),
            self.obs_space,
            tensorlib="torch")
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = TorchCategorical(logits, self.model_config)
        imitation_loss = torch.mean(
            -action_dist.logp(torch.from_numpy(batch["actions"])))
        self.imitation_loss_metric = imitation_loss.item()
        self.policy_loss_metric = np.mean([l.item() for l in policy_loss])

        # Add the imitation loss to each already calculated policy loss term.
        # Alternatively (if custom loss has its own optimizer):
        # return policy_loss + [10 * self.imitation_loss]
        return [loss_ + 10 * imitation_loss for loss_ in policy_loss]
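The self.reader used above is assumed to be created once when the model is built; a short sketch of that setup inside the model's __init__ (the path is a placeholder):

from ray.rllib.offline import JsonReader

# In the model's __init__:
self.reader = JsonReader("/tmp/expert-demos")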
Example 16
def new_ppo_surrogate_loss(policy, model, dist_class, train_batch):

    # zero out the loss elements where you weren't actually acting
    original_space = restore_original_dimensions(train_batch['obs'],
                                                 model.obs_space)
    is_active = original_space['is_active']

    # extract the ppo_surrogate_loss before the mean is taken
    ppo_custom_surrogate_loss(policy, model, dist_class, train_batch)
    pre_mean_loss = policy.loss_obj.pre_mean_loss

    # This mask combines both the valid mask and a check for when we were actually active in the env
    combined_mask = tf.math.logical_and(
        policy.loss_obj.valid_mask, tf.cast(tf.squeeze(is_active, -1),
                                            tf.bool))
    standard_loss = tf.reduce_mean(
        tf.boolean_mask(pre_mean_loss, combined_mask))

    return standard_loss
Example 17
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        reader = JsonReader(self.input_files)
        input_ops = reader.tf_input_ops()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = Categorical(logits)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
Example 18
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.options["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()

        # define a secondary loss by building a graph copy with weight sharing
        logits, _ = self._build_layers_v2({
            "obs": restore_original_dimensions(input_ops["obs"],
                                               self.obs_space)
        }, self.num_outputs, self.options)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        action_dist = Categorical(logits)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
Example 19
def actor_critic_loss(policy, model, _, train_batch):

    restored_obs = restore_original_dimensions(
            train_batch[SampleBatch.CUR_OBS], model.obs_space, "tf")

    flat_valid_actions_mask_t = tf.reshape(restored_obs["valid_actions_mask"], [-1, policy.model.num_outputs])

    restored_next_obs = restore_original_dimensions(
        train_batch[SampleBatch.NEXT_OBS], model.obs_space, "tf")

    model_logits_t, _ = model({
        "obs": train_batch[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    model_logits_tp1, _ = model({
        "obs": train_batch[SampleBatch.NEXT_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    policy_t = tf.nn.softmax(model_logits_t, axis=-1)  # shape (batchsize, num_actions)
    log_pis_t = tf.log(policy_t + 1e-8)  # shape (batchsize, num_actions)

    policy_tp1 = tf.nn.softmax(model_logits_tp1, axis=-1)  # shape (batchsize, num_actions)
    log_pis_tp1 = tf.log(policy_tp1 + 1e-8)  # shape (batchsize, num_actions)

    log_alpha = model.log_alpha
    alpha = model.alpha

    # q network evaluation
    main_q_t = model.get_q_values(observations=restored_obs)  # shape (batchsize, num_actions)
    action_indices = tf.stack(
        [tf.cast(tf.range(tf.shape(main_q_t)[0]), tf.int64), train_batch[SampleBatch.ACTIONS]], axis=-1)
    main_q_t_selected = tf.gather_nd(main_q_t, action_indices)  # shape (batchsize,)

    if policy.config["twin_q"]:
        twin_q_t = model.get_twin_q_values(observations=restored_obs)  # shape (batchsize, num_actions)
        twin_q_t_selected = tf.gather_nd(twin_q_t, action_indices)  # shape (batchsize,)
        min_q_t = tf.math.minimum(main_q_t, twin_q_t)  # shape (batchsize, num_actions)
    else:
        min_q_t = main_q_t  # shape (batchsize, num_actions)

    # target q network evaluation
    main_q_targetnet_tp1 = policy.target_model.get_q_values(observations=restored_next_obs)  # shape (batchsize, num_actions)
    if policy.config["twin_q"]:
        twin_q_targetnet_tp1 = policy.target_model.get_twin_q_values(observations=restored_next_obs)  # shape (batchsize, num_actions)
        min_q_targetnet_tp1 = tf.math.minimum(main_q_targetnet_tp1, twin_q_targetnet_tp1)  # shape (batchsize, num_actions)
    else:
        min_q_targetnet_tp1 = main_q_targetnet_tp1  # shape (batchsize, num_actions)

    value_tp1 = tf.stop_gradient(tf.reduce_sum(policy_tp1 * (min_q_targetnet_tp1 - alpha * log_pis_tp1), axis=-1))  # shape (batchsize,)
    assert np.array_equal(np.asarray(value_tp1.get_shape().as_list()), [None]), f"shape is {np.asarray(value_tp1.get_shape().as_list())}"

    value_tp1_masked = (1.0 - tf.cast(train_batch[SampleBatch.DONES], tf.float32)) * value_tp1   # shape (batchsize,)

    assert policy.config["n_step"] == 1, "TODO(hartikainen) n_step > 1"

    # compute RHS of bellman equation
    q_t_target = train_batch[SampleBatch.REWARDS] + policy.config["gamma"] * value_tp1_masked   # shape (batchsize,)
    assert np.array_equal(np.asarray(q_t_target.get_shape().as_list()), [None])

    q1_loss = 0.5 * tf.reduce_mean((main_q_t_selected - q_t_target)**2)
    if policy.config["twin_q"]:
        q2_loss = 0.5 * tf.reduce_mean((twin_q_t_selected - q_t_target) ** 2)

    # TODO use a baseline?
    baseline = 0.0
    value_t = tf.stop_gradient(tf.reduce_sum(policy_t * (min_q_t - alpha * log_pis_t), axis=-1, keep_dims=True))  # shape (batchsize, 1)
    # baseline = value_t

    actor_loss_per_batch_element = tf.reduce_sum(policy_t * tf.stop_gradient(alpha * log_pis_t - min_q_t + baseline), axis= -1)  # shape (batchsize,)
    actor_loss = tf.reduce_mean(actor_loss_per_batch_element)

    policy.max_entropy_target_proportion = tf.Variable(
        policy.config["max_entropy_target_proportion"],
        dtype=tf.float32,
        name="max_entropy_target_proportion")

    # shape (batchsize,) if 'auto'
    target_entropies = (
        policy.max_entropy_target_proportion *
        tf.log(tf.reduce_sum(flat_valid_actions_mask_t, axis=-1, keep_dims=False))
        if policy.config["target_entropy"] == "auto"
        else policy.config["target_entropy"])
    assert np.array_equal(
        np.asarray(target_entropies.get_shape().as_list()), [None])

    policy.target_entropies = target_entropies

    pi_entropies = -tf.reduce_sum(policy_t * log_pis_t, axis=-1)
    # assert np.array_equal(np.asarray(pi_entropies.get_shape().as_list()), [None])
    policy.pi_entropies = pi_entropies
    alpha_backup = tf.stop_gradient(target_entropies - pi_entropies)  # shape (batchsize,)
    assert np.array_equal(np.asarray(alpha_backup.get_shape().as_list()), [None]), f"actual shape {alpha_backup.get_shape().as_list()}"
    alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup)

    # save for stats function
    policy.min_q_t = min_q_t
    # policy.td_error = td_error
    policy.actor_loss = actor_loss
    policy.q1_loss = q1_loss
    if policy.config["twin_q"]:
        policy.q2_loss = q2_loss

    policy.alpha_loss = alpha_loss

    # in a custom apply op we handle the losses separately, but return them
    # combined in one loss for now
    return actor_loss + q1_loss + (q2_loss if policy.config["twin_q"] else 0) + alpha_loss
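For reference, a restatement of what this discrete-action SAC loss computes, with $\bar Q_i$ the target Q networks, $\alpha$ the temperature, $\bar{\mathcal H}$ the target entropy and $\mathcal H(\pi) = -\sum_a \pi\log\pi$:

$$V(s') = \sum_a \pi(a\mid s')\big[\min_i \bar Q_i(s',a) - \alpha\log\pi(a\mid s')\big],\qquad y = r + \gamma\,(1-d)\,V(s')$$
$$J_{Q_i} = \tfrac{1}{2}\,\mathbb{E}\big[(Q_i(s,a) - y)^2\big],\qquad J_\pi = \mathbb{E}\Big[\sum_a \pi(a\mid s)\big(\alpha\log\pi(a\mid s) - \min_i Q_i(s,a)\big)\Big],\qquad J_\alpha = -\mathbb{E}\big[\log\alpha\,\big(\bar{\mathcal H} - \mathcal H(\pi(\cdot\mid s))\big)\big]$$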
Example 20
def actor_critic_loss(policy, model, _, train_batch):

    restored_obs = restore_original_dimensions(
        train_batch[SampleBatch.CUR_OBS], model.obs_space, "tf")

    flat_valid_actions_mask_t = tf.reshape(restored_obs["valid_actions_mask"],
                                           [-1, policy.model.num_outputs])

    restored_next_obs = restore_original_dimensions(
        train_batch[SampleBatch.NEXT_OBS], model.obs_space, "tf")

    zero_states = []
    i = 0
    while "state_in_{}".format(i) in train_batch:
        zero_states.append(tf.zeros_like(train_batch["state_in_{}".format(i)]))
        i += 1

    model_out_t, state_out_t = model(
        {
            "obs": train_batch[SampleBatch.CUR_OBS],
            "is_training": policy._get_is_training_placeholder(),
        }, zero_states, train_batch['seq_lens'])

    if state_out_t:
        max_seq_len = tf.reduce_max(train_batch["seq_lens"])
        valid_mask = tf.sequence_mask(train_batch["seq_lens"], max_seq_len)
        valid_mask = tf.reshape(valid_mask, [-1])
    else:
        valid_mask = tf.ones_like(train_batch[SampleBatch.REWARDS],
                                  dtype=tf.bool)

    def reduce_mean_valid(t):
        return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

    model_out_tp1, _ = model(
        {
            "obs": train_batch[SampleBatch.NEXT_OBS],
            "is_training": policy._get_is_training_placeholder(),
        }, zero_states, train_batch['seq_lens'])

    target_model_out_tp1, _ = policy.target_model(
        {
            "obs": train_batch[SampleBatch.NEXT_OBS],
            "is_training": policy._get_is_training_placeholder(),
        }, zero_states, train_batch['seq_lens'])

    policy_probs_t = tf.nn.softmax(
        model.get_policy_logits(base_model_out=model_out_t),
        axis=-1)  # shape (batchsize, num_actions)
    log_pis_t = tf.log(policy_probs_t + 1e-8)  # shape (batchsize, num_actions)

    policy_probs_tp1 = tf.nn.softmax(
        model.get_policy_logits(base_model_out=model_out_tp1),
        axis=-1)  # shape (batchsize, num_actions)
    log_pis_tp1 = tf.log(policy_probs_tp1 +
                         1e-8)  # shape (batchsize, num_actions)

    log_alpha = model.log_alpha
    alpha = model.alpha

    # q network evaluation
    main_q_t = model.get_q_values(
        base_model_out=model_out_t)  # shape (batchsize, num_actions)
    action_indices = tf.stack([
        tf.cast(tf.range(tf.shape(main_q_t)[0]), tf.int64),
        train_batch[SampleBatch.ACTIONS]
    ],
                              axis=-1)
    main_q_t_selected = tf.gather_nd(main_q_t,
                                     action_indices)  # shape (batchsize,)

    if policy.config["twin_q"]:
        twin_q_t = model.get_twin_q_values(
            base_model_out=model_out_t)  # shape (batchsize, num_actions)
        twin_q_t_selected = tf.gather_nd(twin_q_t,
                                         action_indices)  # shape (batchsize,)
        min_q_t = tf.math.minimum(main_q_t,
                                  twin_q_t)  # shape (batchsize, num_actions)
    else:
        min_q_t = main_q_t  # shape (batchsize, num_actions)

    # target q network evaluation
    main_q_targetnet_tp1 = policy.target_model.get_q_values(
        base_model_out=target_model_out_tp1)  # shape (batchsize, num_actions)
    if policy.config["twin_q"]:
        twin_q_targetnet_tp1 = policy.target_model.get_twin_q_values(
            base_model_out=target_model_out_tp1
        )  # shape (batchsize, num_actions)
        min_q_targetnet_tp1 = tf.math.minimum(
            main_q_targetnet_tp1,
            twin_q_targetnet_tp1)  # shape (batchsize, num_actions)
    else:
        min_q_targetnet_tp1 = main_q_targetnet_tp1  # shape (batchsize, num_actions)

    q_value_tp1_per_action = (min_q_targetnet_tp1 - alpha * log_pis_tp1)
    value_tp1 = tf.stop_gradient(
        tf.reduce_sum(policy_probs_tp1 * q_value_tp1_per_action,
                      axis=-1))  # shape (batchsize,)
    assert np.array_equal(
        np.asarray(value_tp1.get_shape().as_list()),
        [None]), f"shape is {np.asarray(value_tp1.get_shape().as_list())}"

    value_tp1_masked = (1.0 -
                        tf.cast(train_batch[SampleBatch.DONES],
                                tf.float32)) * value_tp1  # shape (batchsize,)

    assert policy.config["n_step"] == 1, "TODO(hartikainen) n_step > 1"

    # compute RHS of bellman equation
    q_t_target = train_batch[SampleBatch.REWARDS] + policy.config[
        "gamma"] * value_tp1_masked  # shape (batchsize,)
    assert np.array_equal(np.asarray(q_t_target.get_shape().as_list()), [None])

    q1_loss = 0.5 * reduce_mean_valid((main_q_t_selected - q_t_target)**2)
    q2_loss = 0.5 * reduce_mean_valid((twin_q_t_selected - q_t_target)**2)

    # TODO use a baseline?
    baseline = 0.0
    value_t = tf.stop_gradient(
        tf.reduce_sum(policy_probs_t * (min_q_t - alpha * log_pis_t),
                      axis=-1,
                      keep_dims=True))  # shape (batchsize, 1)
    # baseline = value_t

    actor_loss_per_batch_element = tf.reduce_sum(
        policy_probs_t *
        tf.stop_gradient(alpha * log_pis_t - min_q_t + baseline),
        axis=-1)  # shape (batchsize,)
    actor_loss = reduce_mean_valid(actor_loss_per_batch_element)

    target_entropies = policy.config["max_entropy_target_proportion"] * tf.log(
        1e-8 +
        tf.reduce_sum(flat_valid_actions_mask_t, axis=-1, keep_dims=False)
    ) if policy.config["target_entropy"] == "auto" else policy.config[
        "target_entropy"]  # shape (batchsize,) if 'auto'
    assert np.array_equal(np.asarray(target_entropies.get_shape().as_list()),
                          [None])

    target_entropies = tf.debugging.check_numerics(
        target_entropies, f"nan found in target_entropies", name=None)

    policy.target_entropies = target_entropies

    # assert False, f"target entropy is {target_entropy}, action_space.n is {policy.action_space.n}"

    pi_entropies = -tf.reduce_sum(policy_probs_t * log_pis_t, axis=-1)

    pi_entropies = tf.debugging.check_numerics(pi_entropies,
                                               f"nan found in pi_entropies",
                                               name=None)

    # assert np.array_equal(np.asarray(pi_entropies.get_shape().as_list()), [None])
    policy.pi_entropies = pi_entropies
    alpha_backup = tf.stop_gradient(target_entropies -
                                    pi_entropies)  # shape (batchsize,)
    assert np.array_equal(
        np.asarray(alpha_backup.get_shape().as_list()),
        [None]), f"actual shape {alpha_backup.get_shape().as_list()}"
    alpha_loss = -reduce_mean_valid(log_alpha * alpha_backup)

    # save for stats function
    policy.min_q_t = min_q_t
    # policy.td_error = td_error
    policy.actor_loss = actor_loss
    policy.q1_loss = q1_loss
    policy.q2_loss = q2_loss

    policy.alpha_loss = alpha_loss

    total_loss = actor_loss + q1_loss + q2_loss + alpha_loss

    total_loss = tf.debugging.check_numerics(total_loss,
                                             f"nan found in total_loss",
                                             name=None)

    # in a custom apply op we handle the losses separately, but return them
    # combined in one loss for now
    return total_loss