    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.options["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()

        # define a secondary loss by building a graph copy with weight sharing
        obs = tf.cast(input_ops["obs"], tf.float32)
        logits, _ = self._build_layers_v2(
            {"obs": restore_original_dimensions(obs, self.obs_space)},
            self.num_outputs, self.options)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        action_dist = Categorical(logits, self.options)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        total_loss = (policy_loss
                      + self.options["custom_options"]["lambda1"] * policy_loss
                      + self.options["custom_options"]["lambda2"] * self.imitation_loss)
        return total_loss
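
The loss above reads its offline data path and mixing coefficients from self.options["custom_options"]. Below is a minimal registration/configuration sketch of how such a model might be wired up; the class name CustomLossModel, the data path, and the lambda values are assumptions, not part of the original example.

# Sketch only: CustomLossModel, the path, and the lambda values are assumed names.
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("custom_loss_model", CustomLossModel)

config = {
    "model": {
        "custom_model": "custom_loss_model",
        # Keys read by custom_loss() via self.options["custom_options"].
        "custom_options": {
            "input_files": "/path/to/offline/json",
            "lambda1": 1.0,
            "lambda2": 1.0,
        },
    },
}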
Example 2
def get_log_likelihood(policy, q_model, actions, input_dict, obs_space,
                       action_space, config):
    # Action Q network.
    q_vals = _compute_q_values(policy, q_model,
                               input_dict[SampleBatch.CUR_OBS], obs_space,
                               action_space)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals
    action_dist = Categorical(q_vals, q_model)
    return action_dist.logp(actions)
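
Categorical(q_vals, q_model).logp(actions) yields the log-probability of each taken action under a softmax over the Q-values. Below is a standalone TensorFlow sketch (not tied to RLlib) of the same quantity, assuming three discrete actions.

# Plain-TF sketch of what Categorical(logits).logp(actions) computes:
# log softmax(logits)[action], i.e. the negated sparse softmax cross-entropy.
import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0]])   # e.g. Q-values for 3 discrete actions
actions = tf.constant([0])                 # the action that was taken
logp = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits)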
Example 3
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(
            self.model_config["custom_model_config"]["input_files"])
        input_ops = reader.tf_input_ops(
            self.model_config["custom_model_config"].get("expert_size", 1))

        # define a secondary loss by building a graph copy with weight sharing
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during forward(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        self.policy_loss = policy_loss
        (action_scores, model_logits,
         dist) = self.get_q_value_distributions(logits)
        model_logits = tf.squeeze(model_logits)
        action_dist = Categorical(model_logits, self.model_config)

        expert_logits = tf.cast(input_ops["actions"], tf.int32)
        expert_action = tf.math.argmax(expert_logits)
        expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
        model_action = action_dist.deterministic_sample()
        model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
        model_expert = model_action_one_hot * expert_action_one_hot
        imitation_loss = 0
        loss_type = self.model_config["custom_model_config"].get("loss", "ce")
        if loss_type == "ce":
            imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
        elif loss_type == "kl":
            expert_dist = Categorical(
                tf.one_hot(expert_logits, self.num_outputs), self.model_config)
            imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
        elif loss_type == "dqfd":
            max_value = float("-inf")
            # TODO: Clarify the difference between action_scores, dist, and logits.
            Q_select = model_logits
            for a in range(self.num_outputs):
                max_value = tf.maximum(
                    Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                    max_value)
            imitation_loss = tf.reduce_mean(
                1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))

        self.imitation_loss = imitation_loss
        total_loss = (
            self.model_config["custom_model_config"]["lambda1"] * policy_loss
            + self.model_config["custom_model_config"]["lambda2"] * self.imitation_loss)
        return total_loss
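
This variant reads everything from self.model_config["custom_model_config"]: the expert data files, the batch size for the offline reader, the imitation-loss type, and the two mixing weights. Below is a configuration sketch built around those keys; the registered model name, the path, and the numeric values are assumptions.

config = {
    "model": {
        "custom_model": "dqn_imitation_model",      # assumed registered name
        "custom_model_config": {
            "input_files": "/path/to/expert/json",  # offline expert transitions
            "expert_size": 1,                       # batch size for tf_input_ops()
            "loss": "ce",                           # one of "ce", "kl", "dqfd"
            "lambda1": 1.0,                         # weight on the RL policy loss
            "lambda2": 1.0,                         # weight on the imitation loss
        },
    },
}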
Example 4
    def _postprocess_helper_tf(self, obs, next_obs, actions):
        with (tf.GradientTape()
              if self.framework != "tf" else NullContextManager()) as tape:
            # Push both observations through feature net to get both phis.
            phis, _ = self.model._curiosity_feature_net({
                SampleBatch.OBS: tf.concat([obs, next_obs], axis=0)
            })
            phi, next_phi = tf.split(phis, 2)

            # Predict next phi with forward model.
            predicted_next_phi = self.model._curiosity_forward_fcnet(
                tf.concat(
                    [phi, tf_one_hot(actions, self.action_space)], axis=-1))

            # Forward loss term (predicted phi', given phi and action vs
            # actually observed phi').
            forward_l2_norm_squared = 0.5 * tf.reduce_sum(
                tf.square(predicted_next_phi - next_phi), axis=-1)
            forward_loss = tf.reduce_mean(forward_l2_norm_squared)

            # Inverse loss term (predicted action that led from phi to phi' vs
            # actual action taken).
            phi_cat_next_phi = tf.concat([phi, next_phi], axis=-1)
            dist_inputs = self.model._curiosity_inverse_fcnet(phi_cat_next_phi)
            action_dist = Categorical(dist_inputs, self.model) if \
                isinstance(self.action_space, Discrete) else \
                MultiCategorical(
                    dist_inputs, self.model, self.action_space.nvec)
            # Neg log(p); p=probability of observed action given the inverse-NN
            # predicted action distribution.
            inverse_loss = -action_dist.logp(actions)
            inverse_loss = tf.reduce_mean(inverse_loss)

            # Calculate the ICM loss.
            loss = (1.0 - self.beta) * inverse_loss + self.beta * forward_loss

        # Step the optimizer.
        if self.framework != "tf":
            grads = tape.gradient(loss, self._optimizer_var_list)
            grads_and_vars = [(g, v)
                              for g, v in zip(grads, self._optimizer_var_list)
                              if g is not None]
            update_op = self._optimizer.apply_gradients(grads_and_vars)
        else:
            update_op = self._optimizer.minimize(
                loss, var_list=self._optimizer_var_list)

        # Return the squared l2 norm and the optimizer update op.
        return forward_l2_norm_squared, update_op
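
The ICM objective combines the two terms as (1.0 - beta) * inverse_loss + beta * forward_loss. Below is a standalone sketch of the forward-model term in plain TensorFlow, with random placeholder tensors standing in for the feature-net outputs.

import tensorflow as tf

next_phi = tf.random.normal([4, 8])            # observed next-state features phi'
predicted_next_phi = tf.random.normal([4, 8])  # forward-model prediction of phi'
forward_l2_norm_squared = 0.5 * tf.reduce_sum(
    tf.square(predicted_next_phi - next_phi), axis=-1)
forward_loss = tf.reduce_mean(forward_l2_norm_squared)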
Example 5
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        reader = JsonReader(self.model_config["custom_model_config"]["input_files"])
        input_ops = reader.tf_input_ops()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space
        )
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during forward(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = Categorical(logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(-action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss

    def _train(self):
        import numpy as np
        import ray
        import tensorflow as tf
        policy = self.get_policy()
        steps = 0
        n_episodes = 1
        for _ in range(n_episodes):
            env = self.env._env.rail_env
            obs = self.env.reset()
            num_outputs = env.action_space[0]
            n_agents = env.get_num_agents()
            dispatcher = CellGraphDispatcher(env)

            # TODO : Update max_steps as per latest version
            # https://gitlab.aicrowd.com/flatland/flatland-examples/blob/master/reinforcement_learning/multi_agent_training.py
            # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities))) - 1
            max_steps = int(4 * 2 * (20 + env.height + env.width))
            episode_steps = 0
            episode_max_steps = 0
            episode_num_agents = 0
            episode_score = 0
            episode_done_agents = 0
            done = {}
            done["__all__"] = False

            # TODO: Support for batch update
            # batch_size = 2
            # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)

            for step in range(max_steps):
                action_dict = dispatcher.step(env._elapsed_steps)

                with tf.GradientTape() as tape:
                    imitation_loss = 0
                    active_agents = 0
                    for a in range(n_agents):
                        if not done.get(a) and obs.get(a) is not None:
                            active_agents += 1
                            expert_action = action_dict[a].value
                            input_dict = {"obs": np.expand_dims(obs[a], 0)}
                            input_dict["obs_flat"] = input_dict["obs"]
                            logits, _ = policy.model.forward(
                                input_dict, [], None)
                            model_logits = tf.squeeze(logits)
                            expert_logits = tf.cast(expert_action, tf.int32)

                            action_dist = Categorical(
                                logits, policy.model.model_config)

                            imitation_loss += tf.reduce_mean(-action_dist.logp(
                                tf.expand_dims(expert_logits, 0)))
                    imitation_loss = imitation_loss / max(active_agents, 1)

                gradients = tape.gradient(imitation_loss,
                                          policy.model.trainable_variables())

                self.workers.local_worker().apply_gradients(gradients)
                weights = ray.put(self.workers.local_worker().get_weights())
                # print(self.workers.local_worker().get_weights()['default_policy'][0][:4])
                for e in self.workers.remote_workers():
                    e.set_weights.remote(weights)

                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1

                for agent, agent_info in info.items():
                    if agent_info["agent_done"]:
                        episode_done_agents += 1

                if done["__all__"]:
                    for agent, agent_info in info.items():
                        if episode_max_steps == 0:
                            episode_max_steps = agent_info["max_episode_steps"]
                            episode_num_agents = agent_info["num_agents"]
                        episode_steps = max(episode_steps,
                                            agent_info["agent_step"])
                        episode_score += agent_info["agent_score"]
                    print(float(episode_done_agents) / episode_num_agents)
                    break

        norm_factor = 1.0 / (episode_max_steps * episode_num_agents)

        result = {
            "expert_episode_reward_mean": episode_score,
            "episode_reward_mean": episode_score,
            "expert_episode_completion_mean":
            float(episode_done_agents) / episode_num_agents,
            "expert_episode_score_normalized": episode_score * norm_factor,
            "episodes_this_iter": n_episodes,
            "timesteps_this_iter": steps,
        }

        # Code taken from _train method of trainer_template.py - TODO: Not working
        # res = self.collect_metrics()
        # res = {}
        # res.update(
        #     optimizer_steps_this_iter=steps,
        #     episode_reward_mean=episode_score,
        #     info=res.get("info", {}))
        # res.update(expert_scores = result)

        return result
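
The inner loop of _train() performs a hand-rolled imitation update: the cross-entropy of the model's action distribution against the dispatcher's expert action, differentiated with a GradientTape and applied through the RLlib workers. Below is a standalone TF 2.x sketch of that update pattern; the Keras model, optimizer, observation size, and expert action are stand-ins, not the original policy/worker plumbing.

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(5)])  # stand-in for policy.model
optimizer = tf.keras.optimizers.Adam(1e-4)
obs = tf.random.normal([1, 11])          # one agent's observation
expert_action = tf.constant([2])         # action chosen by the expert dispatcher

with tf.GradientTape() as tape:
    logits = model(obs)
    imitation_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=expert_action, logits=logits))
grads = tape.gradient(imitation_loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))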