Example 1
    def __init__(self, q_values, observations, num_actions, cur_epsilon,
                 softmax, softmax_temp, model_config):
        if softmax:
            action_dist = Categorical(q_values / softmax_temp)
            self.action = action_dist.sample()
            self.action_prob = tf.exp(action_dist.sampled_action_logp())
            return

        deterministic_actions = tf.argmax(q_values, axis=1)
        batch_size = tf.shape(observations)[0]

        # Special case masked out actions (q_value ~= -inf) so that we don't
        # even consider them for exploration.
        random_valid_action_logits = tf.where(
            tf.equal(q_values, tf.float32.min),
            tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
        random_actions = tf.squeeze(
            tf.multinomial(random_valid_action_logits, 1), axis=1)

        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1,
            dtype=tf.float32) < cur_epsilon
        self.action = tf.where(chose_random, random_actions,
                               deterministic_actions)
        self.action_prob = None
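The TF graph above implements epsilon-greedy selection while skipping masked-out actions (those whose Q-value was pinned to tf.float32.min). A minimal NumPy sketch of the same decision rule, independent of RLlib and purely illustrative (it uses -inf as the mask value instead of tf.float32.min):

import numpy as np

def epsilon_greedy(q_values, epsilon, rng=np.random):
    """q_values: [batch, num_actions]; masked actions carry -np.inf."""
    batch_size, num_actions = q_values.shape
    greedy = np.argmax(q_values, axis=1)

    # Sample uniformly, but only among actions that are not masked out.
    valid = np.isfinite(q_values)
    probs = valid / valid.sum(axis=1, keepdims=True)
    random = np.array([rng.choice(num_actions, p=p) for p in probs])

    # With probability epsilon take the random action, else the greedy one.
    explore = rng.uniform(size=batch_size) < epsilon
    return np.where(explore, random, greedy)

# Example: action 2 is masked out, so exploration never picks it.
q = np.array([[1.0, 0.5, -np.inf]])
print(epsilon_greedy(q, epsilon=1.0))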
Example 2
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(self.options["custom_options"]["input_files"])
        input_ops = reader.tf_input_ops()

        # define a secondary loss by building a graph copy with weight sharing
        obs = tf.cast(input_ops["obs"], tf.float32)
        logits, _ = self._build_layers_v2(
            {"obs": restore_original_dimensions(obs, self.obs_space)},
            self.num_outputs, self.options)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        action_dist = Categorical(logits, self.options)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(
            -action_dist.logp(input_ops["actions"]))
        total_loss = policy_loss + self.options["custom_options"]["lambda1"]\
            * policy_loss + self.options["custom_options"]["lambda2"]\
            * self.imitation_loss
        return total_loss
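The imitation term above is just the mean negative log-likelihood of the expert actions under Categorical(logits). A small NumPy sketch of that quantity (illustrative only, not RLlib code):

import numpy as np

def imitation_nll(logits, expert_actions):
    """Mean -log p(expert_action) under a softmax over `logits`."""
    # Log-softmax, computed stably by subtracting the row-wise max.
    z = logits - logits.max(axis=1, keepdims=True)
    log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    picked = log_probs[np.arange(len(expert_actions)), expert_actions]
    return -picked.mean()

logits = np.array([[2.0, 0.0, -1.0], [0.0, 1.0, 0.0]])
actions = np.array([0, 1])
print(imitation_nll(logits, actions))  # same as tf.reduce_mean(-dist.logp(a))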
Example 3
def sample_action_from_q_network(policy, q_model, input_dict, obs_space,
                                 action_space, config):
    # Action Q network.
    q_values, q_logits, q_dist = _compute_q_values(
        policy, q_model, input_dict[SampleBatch.CUR_OBS], obs_space,
        action_space)
    policy.q_values = q_values
    policy.q_func_vars = q_model.variables()

    # Noise vars for Q network except for layer normalization vars
    if config["parameter_noise"]:
        _build_parameter_noise(
            policy,
            [var for var in policy.q_func_vars if "LayerNorm" not in var.name])
        policy.action_probs = tf.nn.softmax(policy.q_values)

    # TODO(sven): Move soft_q logic to different Exploration child-component.
    action_log_prob = None
    if config["soft_q"]:
        action_dist = Categorical(q_values / config["softmax_temp"])
        policy.output_actions = action_dist.sample()
        action_log_prob = action_dist.sampled_action_logp()
        policy.action_prob = tf.exp(action_log_prob)
    else:
        policy.output_actions = tf.argmax(q_values, axis=1)
        policy.action_prob = None
    return policy.output_actions, action_log_prob
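In the `soft_q` branch, actions are drawn from a Boltzmann distribution over the Q-values, i.e. softmax(q / temperature); the other branch is a plain argmax. A NumPy sketch of the two behaviours (illustrative, outside RLlib):

import numpy as np

def boltzmann_action(q_values, temperature, rng=np.random):
    z = q_values / temperature
    z = z - z.max()                      # numerical stability
    probs = np.exp(z) / np.exp(z).sum()  # softmax(q / T)
    return rng.choice(len(q_values), p=probs), probs

q = np.array([1.0, 2.0, 0.5])
print(boltzmann_action(q, temperature=1.0))  # stochastic, follows softmax
print(np.argmax(q))                          # greedy (soft_q=False) choice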
Example 4
def get_log_likelihood(policy, q_model, actions, input_dict, obs_space,
                       action_space, config):
    # Action Q network.
    q_vals = _compute_q_values(policy, q_model,
                               input_dict[SampleBatch.CUR_OBS], obs_space,
                               action_space)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals
    action_dist = Categorical(q_vals, q_model)
    return action_dist.logp(actions)
Example 5
    def _postprocess_helper_tf(self, obs, next_obs, actions):
        with (tf.GradientTape()
              if self.framework != "tf" else NullContextManager()) as tape:
            # Push both observations through feature net to get both phis.
            phis, _ = self.model._curiosity_feature_net({
                SampleBatch.OBS: tf.concat([obs, next_obs], axis=0)
            })
            phi, next_phi = tf.split(phis, 2)

            # Predict next phi with forward model.
            predicted_next_phi = self.model._curiosity_forward_fcnet(
                tf.concat(
                    [phi, tf_one_hot(actions, self.action_space)], axis=-1))

            # Forward loss term (predicted phi', given phi and action vs
            # actually observed phi').
            forward_l2_norm_squared = 0.5 * tf.reduce_sum(
                tf.square(predicted_next_phi - next_phi), axis=-1)
            forward_loss = tf.reduce_mean(forward_l2_norm_squared)

            # Inverse loss term (predicted action that led from phi to phi' vs
            # actual action taken).
            phi_cat_next_phi = tf.concat([phi, next_phi], axis=-1)
            dist_inputs = self.model._curiosity_inverse_fcnet(phi_cat_next_phi)
            action_dist = Categorical(dist_inputs, self.model) if \
                isinstance(self.action_space, Discrete) else \
                MultiCategorical(
                    dist_inputs, self.model, self.action_space.nvec)
            # Neg log(p); p=probability of observed action given the inverse-NN
            # predicted action distribution.
            inverse_loss = -action_dist.logp(actions)
            inverse_loss = tf.reduce_mean(inverse_loss)

            # Calculate the ICM loss.
            loss = (1.0 - self.beta) * inverse_loss + self.beta * forward_loss

        # Step the optimizer.
        if self.framework != "tf":
            grads = tape.gradient(loss, self._optimizer_var_list)
            grads_and_vars = [(g, v)
                              for g, v in zip(grads, self._optimizer_var_list)
                              if g is not None]
            update_op = self._optimizer.apply_gradients(grads_and_vars)
        else:
            update_op = self._optimizer.minimize(
                loss, var_list=self._optimizer_var_list)

        # Return the squared l2 norm and the optimizer update op.
        return forward_l2_norm_squared, update_op
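The curiosity module combines two terms: a forward loss (squared error between the predicted and the observed next feature vector phi') and an inverse loss (negative log-likelihood of the taken action), weighted by beta. A NumPy sketch of that combination with toy feature vectors (illustrative only):

import numpy as np

def icm_loss(phi_next_pred, phi_next, action_log_prob, beta):
    # Forward term: 0.5 * ||predicted phi' - observed phi'||^2, then batch mean.
    forward = 0.5 * np.sum((phi_next_pred - phi_next) ** 2, axis=-1)
    forward_loss = forward.mean()
    # Inverse term: mean -log p(taken action | phi, phi').
    inverse_loss = -np.mean(action_log_prob)
    return (1.0 - beta) * inverse_loss + beta * forward_loss

pred = np.array([[0.1, 0.2], [0.0, 1.0]])
obs = np.array([[0.0, 0.2], [0.1, 0.9]])
logp = np.array([-0.3, -1.2])
print(icm_loss(pred, obs, logp, beta=0.2))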
Example 6
 def testCategorical(self):
     num_samples = 100000
     logits = tf.placeholder(tf.float32, shape=(None, 10))
     z = 8 * (np.random.rand(10) - 0.5)
     data = np.tile(z, (num_samples, 1))
     c = Categorical(logits, {})  # dummy config dict
     sample_op = c.sample()
     sess = tf.Session()
     sess.run(tf.global_variables_initializer())
     samples = sess.run(sample_op, feed_dict={logits: data})
     counts = np.zeros(10)
     for sample in samples:
         counts[sample] += 1.0
     probs = np.exp(z) / np.sum(np.exp(z))
     self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)
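The same check can be expressed without a TF session: draw samples directly from softmax(z) and compare the empirical frequencies against the analytic probabilities. A NumPy-only sketch of the test idea:

import numpy as np

num_samples = 100000
z = 8 * (np.random.rand(10) - 0.5)
probs = np.exp(z) / np.sum(np.exp(z))
samples = np.random.choice(10, size=num_samples, p=probs)
counts = np.bincount(samples, minlength=10)
# Same tolerance as the test above.
assert np.sum(np.abs(probs - counts / num_samples)) <= 0.01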
Example 7
    def custom_loss(self, policy_loss, loss_inputs):
        # Create a new input reader per worker.
        reader = JsonReader(self.model_config["custom_model_config"]["input_files"])
        input_ops = reader.tf_input_ops()

        # Define a secondary loss by building a graph copy with weight sharing.
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space
        )
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # Compute the IL loss.
        action_dist = Categorical(logits, self.model_config)
        self.policy_loss = policy_loss
        self.imitation_loss = tf.reduce_mean(-action_dist.logp(input_ops["actions"]))
        return policy_loss + 10 * self.imitation_loss
Example 8
    def custom_loss(self, policy_loss, loss_inputs):
        # create a new input reader per worker
        reader = JsonReader(
            self.model_config["custom_model_config"]["input_files"])
        input_ops = reader.tf_input_ops(
            self.model_config["custom_model_config"].get("expert_size", 1))

        # define a secondary loss by building a graph copy with weight sharing
        obs = restore_original_dimensions(
            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
        logits, _ = self.forward({"obs": obs}, [], None)

        # You can also add self-supervised losses easily by referencing tensors
        # created during _build_layers_v2(). For example, an autoencoder-style
        # loss can be added as follows:
        # ae_loss = squared_diff(
        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
        # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

        # compute the IL loss
        self.policy_loss = policy_loss
        (action_scores, model_logits,
         dist) = self.get_q_value_distributions(logits)
        model_logits = tf.squeeze(model_logits)
        action_dist = Categorical(model_logits, self.model_config)

        expert_logits = tf.cast(input_ops["actions"], tf.int32)
        expert_action = tf.math.argmax(expert_logits)
        expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
        model_action = action_dist.deterministic_sample()
        model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
        model_expert = model_action_one_hot * expert_action_one_hot
        imitation_loss = 0
        loss_type = self.model_config["custom_model_config"].get("loss", "ce")
        if loss_type == "ce":
            imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
        elif loss_type == "kl":
            expert_dist = Categorical(
                tf.one_hot(expert_logits, self.num_outputs), self.model_config)
            imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
        elif loss_type == "dqfd":
            max_value = float("-inf")
            Q_select = model_logits  # TODO: difference in action_scores,dist and logits
            for a in range(self.num_outputs):
                max_value = tf.maximum(
                    Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                    max_value)
            imitation_loss = tf.reduce_mean(
                1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))

        self.imitation_loss = imitation_loss
        total_loss = self.model_config["custom_model_config"]["lambda1"] * policy_loss \
                     + self.model_config["custom_model_config"]["lambda2"] \
                     * self.imitation_loss
        return total_loss
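The `dqfd` branch is inspired by the large-margin loss from DQfD, which in its standard form is max_a [Q(a) + margin * 1(a != a_expert)] - Q(a_expert): it is zero only when the expert action's Q-value beats every other action by at least the margin. A NumPy sketch of that standard form for a single state (illustrative; the margin 0.8 mirrors the constant above, and this is not a drop-in replacement for the code):

import numpy as np

def large_margin_loss(q_values, expert_action, margin=0.8):
    # Add the margin to every non-expert action, then take the max.
    bonus = np.full_like(q_values, margin)
    bonus[expert_action] = 0.0
    return np.max(q_values + bonus) - q_values[expert_action]

q = np.array([1.0, 2.0, 0.5])
print(large_margin_loss(q, expert_action=1))  # 0.0: expert beats others by the margin
print(large_margin_loss(q, expert_action=0))  # 1.8: expert action is not dominant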
Example 9
 def xyz_compute_actions(
     self,
     *,
     input_dict,
     explore=True,
     timestep=None,
     episodes=None,
     is_training=False,
     **kwargs,
 ) -> Tuple[TensorStructType, List[TensorType], Dict[str,
                                                     TensorStructType]]:
     if timestep is None:
         timestep = self.global_timestep
     # Compute the Q-values for each possible action, using our Q-value network.
     q_vals = self._compute_q_values(self.model,
                                     input_dict[SampleBatch.OBS],
                                     is_training=is_training)
     # Use a Categorical distribution for the exploration component.
     # This way, it may either sample stochastically (e.g. when using SoftQ)
     # or deterministically/greedily (e.g. when using EpsilonGreedy).
     distribution = Categorical(q_vals, self.model)
     # Call the exploration component's `get_exploration_action` method to
     # explore, if necessary.
     actions, logp = self.exploration.get_exploration_action(
         action_distribution=distribution,
         timestep=timestep,
         explore=explore)
     # Return (exploration) actions, state_outs (empty list), and extra outs.
     return (
         actions,
         [],
         {
             "q_values": q_vals,
             SampleBatch.ACTION_LOGP: logp,
             SampleBatch.ACTION_PROB: tf.exp(logp),
             SampleBatch.ACTION_DIST_INPUTS: q_vals,
         },
     )
Example 10
 def logp(self, actions):
     a1, a2 = actions[:, 0], actions[:, 1]
     a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
     a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec])
     return (Categorical(a1_logits).logp(a1) +
             Categorical(a2_logits).logp(a2))
Example 11
 def _a2_distribution(self, a1):
     a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
     _, a2_logits = self.model.action_model([self.inputs, a1_vec])
     a2_dist = Categorical(a2_logits)
     return a2_dist
Example 12
 def _a1_distribution(self):
     BATCH = tf.shape(self.inputs)[0]
     a1_logits, _ = self.model.action_model(
         [self.inputs, tf.zeros((BATCH, 1))])
     a1_dist = Categorical(a1_logits)
     return a1_dist
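Examples 10 to 12 come from an autoregressive action distribution: a1 is sampled first, then fed back into the model to produce the logits for a2, and the joint log-probability is the sum of the two factors. A NumPy sketch of that sampling scheme with a hypothetical stand-in for `self.model.action_model` (illustrative only):

import numpy as np

def softmax(x):
    z = x - x.max()
    return np.exp(z) / np.exp(z).sum()

def sample_autoregressive(model, obs, rng=np.random):
    # First head: p(a1 | obs).
    a1_logits, _ = model(obs, a1=None)
    p1 = softmax(a1_logits)
    a1 = rng.choice(len(p1), p=p1)
    # Second head: p(a2 | obs, a1) -- a1 is fed back into the model.
    _, a2_logits = model(obs, a1=a1)
    p2 = softmax(a2_logits)
    a2 = rng.choice(len(p2), p=p2)
    # Joint log-likelihood factorizes into the two conditionals.
    logp = np.log(p1[a1]) + np.log(p2[a2])
    return (a1, a2), logp

# Hypothetical toy model standing in for the real action model.
def toy_model(obs, a1):
    a1_logits = np.array([0.2, -0.1])
    a2_logits = np.array([0.0, 0.5, -0.5]) + (0.0 if a1 is None else a1)
    return a1_logits, a2_logits

print(sample_autoregressive(toy_model, obs=np.zeros(4)))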
Example 13
    def test_pg_loss_functions(self):
        """Tests the PG loss function math."""
        config = pg.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
        config["eager"] = True
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"

        # Fake CartPole episode of n time steps.
        train_batch = {
            SampleBatch.CUR_OBS:
            np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
                      [0.9, 1.0, 1.1, 1.2]]),
            SampleBatch.ACTIONS:
            np.array([0, 1, 1]),
            SampleBatch.REWARDS:
            np.array([1.0, 1.0, 1.0]),
            SampleBatch.DONES:
            np.array([False, False, True])
        }

        # tf.
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()

        # Post-process (calculate simple (non-GAE) advantages) and attach to
        # train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch = pg.post_process_advantages(policy, train_batch)
        # Check Advantage values.
        check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        results = pg.pg_tf_loss(policy,
                                policy.model,
                                dist_class=Categorical,
                                train_batch=train_batch)

        # Calculate expected results.
        expected_logits = fc(
            fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
               vars[1].numpy()), vars[2].numpy(), vars[3].numpy())
        expected_logp = Categorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp * train_batch[Postprocessing.ADVANTAGES])
        check(results.numpy(), expected_loss, decimals=4)

        # Torch.
        config["use_pytorch"] = True
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        train_batch = policy._lazy_tensor_dict(train_batch)
        results = pg.pg_torch_loss(policy,
                                   policy.model,
                                   dist_class=TorchCategorical,
                                   train_batch=train_batch)
        expected_logits = policy.model.last_output()
        expected_logp = TorchCategorical(expected_logits, policy.model).logp(
            train_batch[SampleBatch.ACTIONS])
        expected_loss = -np.mean(
            expected_logp.detach().numpy() *
            train_batch[Postprocessing.ADVANTAGES].numpy())
        check(results.detach().numpy(), expected_loss, decimals=4)
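The advantage values checked above are the non-GAE discounted returns of the three rewards with gamma = 0.99, and the PG loss is the negative mean of logp(action) weighted by those returns. A NumPy sketch of both pieces of arithmetic (the log-probs below are placeholders):

import numpy as np

def discounted_returns(rewards, gamma):
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

advantages = discounted_returns(np.array([1.0, 1.0, 1.0]), gamma=0.99)
print(advantages)  # [2.9701, 1.99, 1.0]

# PG loss: -mean(logp(a_t) * A_t), with logp taken from Categorical(logits).
logp = np.array([-0.69, -0.71, -0.70])  # placeholder log-probs
print(-np.mean(logp * advantages))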
Example 14
    def _train(self):
        import tensorflow as tf
        policy = self.get_policy()
        steps = 0
        n_episodes = 1
        for _ in range(n_episodes):
            env = self.env._env.rail_env
            obs = self.env.reset()
            num_outputs = env.action_space[0]
            n_agents = env.get_num_agents()
            dispatcher = CellGraphDispatcher(env)

            # TODO : Update max_steps as per latest version
            # https://gitlab.aicrowd.com/flatland/flatland-examples/blob/master/reinforcement_learning/multi_agent_training.py
            # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities))) - 1
            max_steps = int(4 * 2 * (20 + env.height + env.width))
            episode_steps = 0
            episode_max_steps = 0
            episode_num_agents = 0
            episode_score = 0
            episode_done_agents = 0
            done = {}
            done["__all__"] = False

            # TODO: Support for batch update
            # batch_size = 2
            # logits, _ = policy.model.forward({"obs": np.vstack([obs[a],obs[a]])}, [], None)

            for step in range(max_steps):
                action_dict = dispatcher.step(env._elapsed_steps)

                with tf.GradientTape() as tape:
                    imitation_loss = 0
                    active_agents = 0
                    for a in range(n_agents):
                        if not done.get(a) and obs.get(a) is not None:
                            active_agents += 1
                            expert_action = action_dict[a].value
                            input_dict = {"obs": np.expand_dims(obs[a], 0)}
                            input_dict['obs_flat'] = input_dict['obs']
                            logits, _ = policy.model.forward(
                                input_dict, [], None)
                            model_logits = tf.squeeze(logits)
                            expert_logits = tf.cast(expert_action, tf.int32)

                            action_dist = Categorical(
                                logits, policy.model.model_config)

                            imitation_loss += tf.reduce_mean(-action_dist.logp(
                                tf.expand_dims(expert_logits, 0)))
                    imitation_loss = imitation_loss / max(active_agents, 1)

                gradients = tape.gradient(imitation_loss,
                                          policy.model.trainable_variables())

                self.workers.local_worker().apply_gradients(gradients)
                weights = ray.put(self.workers.local_worker().get_weights())
                # print(self.workers.local_worker().get_weights()['default_policy'][0][:4])
                for e in self.workers.remote_workers():
                    e.set_weights.remote(weights)

                obs, all_rewards, done, info = self.env.step(action_dict)
                steps += 1

                for agent, agent_info in info.items():
                    if agent_info["agent_done"]:
                        episode_done_agents += 1

                if done["__all__"]:
                    for agent, agent_info in info.items():
                        if episode_max_steps == 0:
                            episode_max_steps = agent_info["max_episode_steps"]
                            episode_num_agents = agent_info["num_agents"]
                        episode_steps = max(episode_steps,
                                            agent_info["agent_step"])
                        episode_score += agent_info["agent_score"]
                    print(float(episode_done_agents) / episode_num_agents)
                    break

        norm_factor = 1.0 / (episode_max_steps * episode_num_agents)

        result = {
            "expert_episode_reward_mean": episode_score,
            "episode_reward_mean": episode_score,
            "expert_episode_completion_mean":
            float(episode_done_agents) / episode_num_agents,
            "expert_episode_score_normalized": episode_score * norm_factor,
            "episodes_this_iter": n_episodes,
            "timesteps_this_iter": steps,
        }

        # Code taken from _train method of trainer_template.py - TODO: Not working
        # res = self.collect_metrics()
        # res = {}
        # res.update(
        #     optimizer_steps_this_iter=steps,
        #     episode_reward_mean=episode_score,
        #     info=res.get("info", {}))
        # res.update(expert_scores = result)

        return result
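The per-step update above computes a cross-entropy imitation loss inside a `tf.GradientTape` and applies the gradients by hand. A self-contained TensorFlow 2 sketch of the same pattern with a toy Keras policy (everything here is a stand-in, not the Flatland/RLlib setup above):

import numpy as np
import tensorflow as tf

num_actions = 5
policy_net = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation="relu", input_shape=(16,)),
    tf.keras.layers.Dense(num_actions),  # action logits
])
optimizer = tf.keras.optimizers.Adam(1e-3)

obs = np.random.rand(8, 16).astype(np.float32)          # toy observations
expert_actions = np.random.randint(0, num_actions, 8)   # toy expert labels

with tf.GradientTape() as tape:
    logits = policy_net(obs)
    # -log p(expert_action): same quantity as -Categorical(logits).logp(a).
    imitation_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=expert_actions, logits=logits))

grads = tape.gradient(imitation_loss, policy_net.trainable_variables)
optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))
print(float(imitation_loss))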
Example 15
 def _actions_distribution(self):
     a_dists = list(Categorical(logit) for logit in self.model.logits)
     return a_dists
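When the action space factorizes into several independent discrete components, each component gets its own Categorical over its own logits, and the joint log-probability is the sum of the per-component terms. A NumPy sketch of sampling such a tuple of distributions (illustrative only):

import numpy as np

def softmax(x):
    z = x - x.max()
    return np.exp(z) / np.exp(z).sum()

def sample_multi(logits_list, rng=np.random):
    actions, logp = [], 0.0
    for logits in logits_list:
        p = softmax(logits)
        a = rng.choice(len(p), p=p)
        actions.append(a)
        logp += np.log(p[a])
    return actions, logp

# One logits vector per action component, e.g. a MultiDiscrete([3, 2]) space.
print(sample_multi([np.array([0.1, 0.5, -0.2]), np.array([1.0, -1.0])]))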