Example #1
    def estimate(
        self,
        batch: SampleBatchType,
    ) -> List[OffPolicyEstimate]:
        self.check_can_estimate_for(batch)
        estimates = []
        # Split data into train and test batches
        for train_episodes, test_episodes in train_test_split(
                batch,
                self.train_test_split_val,
                self.k,
        ):

            # Train Q-function
            if train_episodes:
                # Reinitialize model
                self.model.reset()
                train_batch = SampleBatch.concat_samples(train_episodes)
                losses = self.train(train_batch)
                self.losses.append(losses)

            # Calculate doubly robust OPE estimates
            for episode in test_episodes:
                rewards, old_prob = episode["rewards"], episode["action_prob"]
                new_prob = np.exp(self.action_log_likelihood(episode))

                v_old = 0.0
                v_new = 0.0
                q_values = self.model.estimate_q(episode[SampleBatch.OBS],
                                                 episode[SampleBatch.ACTIONS])
                q_values = convert_to_numpy(q_values)

                all_actions = np.zeros(
                    [episode.count, self.policy.action_space.n])
                all_actions[:] = np.arange(self.policy.action_space.n)
                # Two transposes required for torch.distributions to work
                tmp_episode = episode.copy()
                tmp_episode[SampleBatch.ACTIONS] = all_actions.T
                action_probs = np.exp(
                    self.action_log_likelihood(tmp_episode)).T
                v_values = self.model.estimate_v(episode[SampleBatch.OBS],
                                                 action_probs)
                v_values = convert_to_numpy(v_values)

                for t in reversed(range(episode.count)):
                    v_old = rewards[t] + self.gamma * v_old
                    v_new = v_values[t] + (new_prob[t] / old_prob[t]) * (
                        rewards[t] + self.gamma * v_new - q_values[t])
                v_new = v_new.item()

                estimates.append(
                    OffPolicyEstimate(
                        self.name,
                        {
                            "v_old": v_old,
                            "v_new": v_new,
                            "v_gain": v_new / max(1e-8, v_old),
                        },
                    ))
        return estimates
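The backward loop above is the standard per-episode doubly-robust recursion. As a self-contained reference, here is a minimal NumPy-only sketch of just that recursion with made-up inputs (names and numbers are illustrative, not RLlib API):

import numpy as np

def doubly_robust_return(rewards, old_prob, new_prob, q_values, v_values, gamma):
    # At each step t (moving backward):
    #   v_DR = V(s_t) + w_t * (r_t + gamma * v_DR - Q(s_t, a_t)),
    # where w_t = pi_new(a_t | s_t) / pi_old(a_t | s_t).
    v_dr = 0.0
    for t in reversed(range(len(rewards))):
        w_t = new_prob[t] / old_prob[t]
        v_dr = v_values[t] + w_t * (rewards[t] + gamma * v_dr - q_values[t])
    return v_dr

# Toy episode, purely to exercise the recursion.
print(doubly_robust_return(
    rewards=np.array([1.0, 0.0, 1.0]),
    old_prob=np.array([0.5, 0.5, 0.5]),
    new_prob=np.array([0.6, 0.4, 0.7]),
    q_values=np.array([1.8, 0.9, 1.0]),
    v_values=np.array([1.7, 0.8, 0.9]),
    gamma=0.99,
))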
Example #2
 def on_policy_output(self, env_id: str, agent_id: str,
                      output: PolicyOutputType):
     # Buffer latest output states for next input __call__.
     action, states, _ = output
     agent_state = self._states[env_id][agent_id]
     agent_state.action = convert_to_numpy(action)
     agent_state.states = convert_to_numpy(states)
Example #3
 def get_state(self, sess: Optional["tf.Session"] = None):
     if sess:
         return sess.run(self._tf_state_op)
     eps = self.epsilon_schedule(self.last_timestep)
     return {
         "cur_epsilon": convert_to_numpy(eps)
         if self.framework != "tf" else eps,
         "last_timestep": convert_to_numpy(self.last_timestep)
         if self.framework != "tf" else self.last_timestep,
     }
Example #4
    def estimate(self, batch: SampleBatchType) -> Dict[str, Any]:
        """Compute off-policy estimates.

        Args:
            batch: The SampleBatch to run off-policy estimation on

        Returns:
            A dict consisting of the following metrics:
            - v_behavior: The discounted return averaged over episodes in the batch
            - v_behavior_std: The standard deviation corresponding to v_behavior
            - v_target: The estimated discounted return for `self.policy`,
            averaged over episodes in the batch
            - v_target_std: The standard deviation corresponding to v_target
            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
            - v_gain_std: The standard deviation corresponding to v_gain
        """
        batch = self.convert_ma_batch_to_sample_batch(batch)
        self.check_action_prob_in_batch(batch)
        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
        # Calculate doubly robust OPE estimates
        for episode in batch.split_by_episode():
            rewards, old_prob = episode["rewards"], episode["action_prob"]
            log_likelihoods = compute_log_likelihoods_from_input_dict(
                self.policy, episode
            )
            new_prob = np.exp(convert_to_numpy(log_likelihoods))

            v_behavior = 0.0
            v_target = 0.0
            q_values = self.model.estimate_q(episode)
            q_values = convert_to_numpy(q_values)
            v_values = self.model.estimate_v(episode)
            v_values = convert_to_numpy(v_values)
            assert q_values.shape == v_values.shape == (episode.count,)

            for t in reversed(range(episode.count)):
                v_behavior = rewards[t] + self.gamma * v_behavior
                v_target = v_values[t] + (new_prob[t] / old_prob[t]) * (
                    rewards[t] + self.gamma * v_target - q_values[t]
                )
            v_target = v_target.item()

            estimates["v_behavior"].append(v_behavior)
            estimates["v_target"].append(v_target)
            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
        estimates["v_target_std"] = np.std(estimates["v_target"])
        estimates["v_target"] = np.mean(estimates["v_target"])
        estimates["v_gain_std"] = np.std(estimates["v_gain"])
        estimates["v_gain"] = np.mean(estimates["v_gain"])
        return estimates
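The v_behavior recursion in this example is just the discounted return computed backward; the forward sum used by the direct-method and importance-sampling estimators later in this collection gives the same number. A quick self-contained check with toy values (NumPy only, illustrative):

import numpy as np

rewards = np.array([1.0, 0.5, 0.0, 2.0])
gamma = 0.95

# Backward recursion, as in the doubly-robust estimate() above.
v_backward = 0.0
for t in reversed(range(len(rewards))):
    v_backward = rewards[t] + gamma * v_backward

# Forward discounted sum, as in the direct-method / IS estimators.
v_forward = sum(r * gamma**t for t, r in enumerate(rewards))

assert np.isclose(v_backward, v_forward)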
Example #5
 def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
     if self.config["worker_index"]:
         return convert_to_numpy({"worker_loss": self.loss_obj.loss})
     else:
         return convert_to_numpy({
             "cur_kl_coeff": self.kl_coeff_val,
             "cur_lr": self.cur_lr,
             "total_loss": self.loss_obj.loss,
             "policy_loss": self.loss_obj.mean_policy_loss,
             "vf_loss": self.loss_obj.mean_vf_loss,
             "kl_loss": self.loss_obj.mean_kl_loss,
             "inner_kl": self.loss_obj.mean_inner_kl,
             "entropy": self.loss_obj.mean_entropy,
         })
Example #6
def postprocess_nstep_and_prio(
    policy: Policy, batch: SampleBatch, other_agent=None, episode=None
) -> SampleBatch:
    # N-step Q adjustments.
    if policy.config["n_step"] > 1:
        adjust_nstep(policy.config["n_step"], policy.config["gamma"], batch)

    # Create dummy prio-weights (1.0) in case we don't have any in
    # the batch.
    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side.
    if batch.count > 0 and policy.config["replay_buffer_config"].get(
        "worker_side_prioritization", False
    ):
        td_errors = policy.compute_td_error(
            batch[SampleBatch.OBS],
            batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS],
            batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES],
            batch[PRIO_WEIGHTS],
        )
        # Retain compatibility with old-style Replay args
        epsilon = policy.config.get("replay_buffer_config", {}).get(
            "prioritized_replay_eps"
        ) or policy.config.get("prioritized_replay_eps")
        if epsilon is None:
            raise ValueError("prioritized_replay_eps not defined in config.")

        new_priorities = np.abs(convert_to_numpy(td_errors)) + epsilon
        batch[PRIO_WEIGHTS] = new_priorities

    return batch
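For context, the n-step adjustment folds the next n-1 rewards into each transition and repoints next_obs/dones, so a 1-step TD backup on the adjusted batch behaves like an n-step backup on the original data. A rough, out-of-place sketch of that idea (not RLlib's in-place adjust_nstep; names and layout are assumptions):

import numpy as np

def nstep_targets(rewards, next_obs, dones, n_step, gamma):
    # Returns adjusted (rewards, next_obs, dones); accumulation stops at
    # the end of the batch or at a terminal step.
    T = len(rewards)
    new_rewards = np.array(rewards, dtype=np.float32)
    new_next_obs = list(next_obs)
    new_dones = list(dones)
    for t in range(T):
        for k in range(1, n_step):
            if t + k >= T or new_dones[t]:
                break
            new_rewards[t] += gamma**k * rewards[t + k]
            new_next_obs[t] = next_obs[t + k]
            new_dones[t] = dones[t + k]
    return new_rewards, new_next_obs, new_dones

r, o, d = nstep_targets(
    rewards=[1.0, 1.0, 1.0, 1.0],
    next_obs=[1, 2, 3, 4],
    dones=[False, False, False, True],
    n_step=3,
    gamma=0.9,
)
print(r)  # [2.71, 2.71, 1.9, 1.0]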
Example #7
def postprocess_nstep_and_prio(policy: Policy,
                               batch: SampleBatch,
                               other_agent=None,
                               episode=None) -> SampleBatch:
    # N-step Q adjustments.
    if policy.config["n_step"] > 1:
        _adjust_nstep(policy.config["n_step"], policy.config["gamma"],
                      batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
                      batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
                      batch[SampleBatch.DONES])

    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side.
    if batch.count > 0 and policy.config["worker_side_prioritization"]:
        td_errors = policy.compute_td_error(batch[SampleBatch.CUR_OBS],
                                            batch[SampleBatch.ACTIONS],
                                            batch[SampleBatch.REWARDS],
                                            batch[SampleBatch.NEXT_OBS],
                                            batch[SampleBatch.DONES],
                                            batch[PRIO_WEIGHTS])
        new_priorities = (np.abs(convert_to_numpy(td_errors)) +
                          policy.config["prioritized_replay_eps"])
        batch[PRIO_WEIGHTS] = new_priorities

    return batch
Example #8
 def extra_compute_grad_fetches(self):
     if extra_learn_fetches_fn:
         fetches = convert_to_numpy(extra_learn_fetches_fn(self))
         # Auto-add empty learner stats dict if needed.
         return dict({LEARNER_STATS_KEY: {}}, **fetches)
     else:
         return parent_cls.extra_compute_grad_fetches(self)
Example #9
def centralized_critic_postprocessing(
    policy, sample_batch, other_agent_batches=None, episode=None
):
    pytorch = policy.config["framework"] == "torch"
    if (pytorch and hasattr(policy, "compute_central_vf")) or (
        not pytorch and policy.loss_initialized()
    ):
        assert other_agent_batches is not None
        [(_, opponent_batch)] = list(other_agent_batches.values())

        # also record the opponent obs and actions in the trajectory
        sample_batch[OPPONENT_OBS] = opponent_batch[SampleBatch.CUR_OBS]
        sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]

        # overwrite default VF prediction with the central VF
        if pytorch:
            sample_batch[SampleBatch.VF_PREDS] = (
                policy.compute_central_vf(
                    convert_to_torch_tensor(
                        sample_batch[SampleBatch.CUR_OBS], policy.device
                    ),
                    convert_to_torch_tensor(sample_batch[OPPONENT_OBS], policy.device),
                    convert_to_torch_tensor(
                        sample_batch[OPPONENT_ACTION], policy.device
                    ),
                )
                .cpu()
                .detach()
                .numpy()
            )
        else:
            sample_batch[SampleBatch.VF_PREDS] = convert_to_numpy(
                policy.compute_central_vf(
                    sample_batch[SampleBatch.CUR_OBS],
                    sample_batch[OPPONENT_OBS],
                    sample_batch[OPPONENT_ACTION],
                )
            )
    else:
        # Policy hasn't been initialized yet, use zeros.
        sample_batch[OPPONENT_OBS] = np.zeros_like(sample_batch[SampleBatch.CUR_OBS])
        sample_batch[OPPONENT_ACTION] = np.zeros_like(sample_batch[SampleBatch.ACTIONS])
        sample_batch[SampleBatch.VF_PREDS] = np.zeros_like(
            sample_batch[SampleBatch.REWARDS], dtype=np.float32
        )

    completed = sample_batch["dones"][-1]
    if completed:
        last_r = 0.0
    else:
        last_r = sample_batch[SampleBatch.VF_PREDS][-1]

    train_batch = compute_advantages(
        sample_batch,
        last_r,
        policy.config["gamma"],
        policy.config["lambda"],
        use_gae=policy.config["use_gae"],
    )
    return train_batch
Example #10
    def learn_on_batch(self, postprocessed_batch):
        # Callback handling.
        learn_stats = {}
        self.callbacks.on_learn_on_batch(
            policy=self, train_batch=postprocessed_batch, result=learn_stats
        )

        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self._max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

        self._is_training = True
        postprocessed_batch = self._lazy_tensor_dict(postprocessed_batch)
        postprocessed_batch.set_training(True)
        stats = self._learn_on_batch_helper(postprocessed_batch)
        stats.update(
            {
                "custom_metrics": learn_stats,
                NUM_AGENT_STEPS_TRAINED: postprocessed_batch.count,
            }
        )
        return convert_to_numpy(stats)
Example #11
    def get_state(self, sess: Optional["tf.Session"] = None):
        """Returns the current scale value.

        Returns:
            Union[float,tf.Tensor[float]]: The current scale value.
        """
        if sess:
            return sess.run(
                dict(
                    self._tf_state_op,
                    **{
                        "ou_state": self.ou_state,
                    }
                )
            )

        state = super().get_state()
        return dict(
            state,
            **{
                "ou_state": convert_to_numpy(self.ou_state)
                if self.framework != "tf"
                else self.ou_state,
            }
        )
Example #12
    def get_state(self, sess: Optional["tf.Session"] = None):
        """Returns the current scale value.

        Returns:
            Union[float,tf.Tensor[float]]: The current scale value.
        """
        if sess:
            return sess.run(self._tf_state_op)
        scale = self.scale_schedule(self.last_timestep)
        return {
            "cur_scale":
            convert_to_numpy(scale) if self.framework != "tf" else scale,
            "last_timestep":
            convert_to_numpy(self.last_timestep)
            if self.framework != "tf" else self.last_timestep,
        }
Example #13
    def action_log_likelihood(self, batch: SampleBatchType) -> TensorType:
        """Returns log likelihood for actions in given batch for policy.

        Computes likelihoods by passing the observations through the current
        policy's `compute_log_likelihoods()` method

        Args:
            batch: The SampleBatch or MultiAgentBatch to calculate action
                log likelihoods from. This batch/batches must contain OBS
                and ACTIONS keys.

        Returns:
            The log-likelihoods of the actions in the batch, given the
            observations and the policy.
        """
        num_state_inputs = 0
        for k in batch.keys():
            if k.startswith("state_in_"):
                num_state_inputs += 1
        state_keys = ["state_in_{}".format(i) for i in range(num_state_inputs)]
        log_likelihoods: TensorType = self.policy.compute_log_likelihoods(
            actions=batch[SampleBatch.ACTIONS],
            obs_batch=batch[SampleBatch.OBS],
            state_batches=[batch[k] for k in state_keys],
            prev_action_batch=batch.get(SampleBatch.PREV_ACTIONS),
            prev_reward_batch=batch.get(SampleBatch.PREV_REWARDS),
            actions_normalized=True,
        )
        log_likelihoods = convert_to_numpy(log_likelihoods)
        return log_likelihoods
Example #14
    def estimate(self, batch: SampleBatchType) -> Dict[str, Any]:
        """Compute off-policy estimates.

        Args:
            batch: The SampleBatch to run off-policy estimation on

        Returns:
            A dict consisting of the following metrics:
            - v_behavior: The discounted return averaged over episodes in the batch
            - v_behavior_std: The standard deviation corresponding to v_behavior
            - v_target: The estimated discounted return for `self.policy`,
            averaged over episodes in the batch
            - v_target_std: The standard deviation corresponding to v_target
            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
            - v_gain_std: The standard deviation corresponding to v_gain
        """
        batch = self.convert_ma_batch_to_sample_batch(batch)
        self.check_action_prob_in_batch(batch)
        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
        for episode in batch.split_by_episode():
            rewards, old_prob = episode["rewards"], episode["action_prob"]
            log_likelihoods = compute_log_likelihoods_from_input_dict(
                self.policy, episode)
            new_prob = np.exp(convert_to_numpy(log_likelihoods))

            # calculate importance ratios
            p = []
            for t in range(episode.count):
                if t == 0:
                    pt_prev = 1.0
                else:
                    pt_prev = p[t - 1]
                p.append(pt_prev * new_prob[t] / old_prob[t])
            for t, v in enumerate(p):
                if t >= len(self.filter_values):
                    self.filter_values.append(v)
                    self.filter_counts.append(1.0)
                else:
                    self.filter_values[t] += v
                    self.filter_counts[t] += 1.0

            # calculate stepwise weighted IS estimate
            v_behavior = 0.0
            v_target = 0.0
            for t in range(episode.count):
                v_behavior += rewards[t] * self.gamma**t
                w_t = self.filter_values[t] / self.filter_counts[t]
                v_target += p[t] / w_t * rewards[t] * self.gamma**t

            estimates["v_behavior"].append(v_behavior)
            estimates["v_target"].append(v_target)
            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
        estimates["v_target_std"] = np.std(estimates["v_target"])
        estimates["v_target"] = np.mean(estimates["v_target"])
        estimates["v_gain_std"] = np.std(estimates["v_gain"])
        estimates["v_gain"] = np.mean(estimates["v_gain"])
        return estimates
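The cumulative importance ratios p_t built in the loop above are just the running product of per-step probability ratios. A minimal NumPy sketch with toy probabilities (the cross-episode weighting via filter_values / filter_counts is deliberately omitted here):

import numpy as np

# Per-step action probabilities under the behavior and target policies (toy values).
old_prob = np.array([0.5, 0.5, 0.25, 0.5])
new_prob = np.array([0.6, 0.4, 0.30, 0.5])

# p_t = prod_{k <= t} new_prob[k] / old_prob[k]
p = np.cumprod(new_prob / old_prob)

rewards = np.array([1.0, 0.0, 1.0, 1.0])
gamma = 0.99

# Ordinary (unweighted) step-wise IS estimate of the target policy's return.
v_target_is = np.sum(p * rewards * gamma ** np.arange(len(rewards)))
print(p, v_target_is)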
Example #15
 def get_state(self) -> Union[Dict[str, TensorType], List[TensorType]]:
     state = super().get_state()
     state["_optimizer_variables"] = []
     for i, o in enumerate(self._optimizers):
         optim_state_dict = convert_to_numpy(o.state_dict())
         state["_optimizer_variables"].append(optim_state_dict)
     # Add exploration state.
     state["_exploration_state"] = self.exploration.get_state()
     return state
Example #16
 def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
     q_t = torch.stack(self.get_tower_stats("q_t"))
     stats = {
         "actor_loss": torch.mean(torch.stack(self.get_tower_stats("actor_loss"))),
         "critic_loss": torch.mean(torch.stack(self.get_tower_stats("critic_loss"))),
         "mean_q": torch.mean(q_t),
         "max_q": torch.max(q_t),
         "min_q": torch.min(q_t),
     }
     return convert_to_numpy(stats)
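Every snippet in this collection funnels its outputs through convert_to_numpy, which essentially tree-maps framework tensors in a nested structure to NumPy arrays. A rough stand-in for the torch-only case (illustrative only; RLlib's actual implementation also handles TensorFlow tensors and other edge cases):

import torch
import tree  # dm-tree, the nested-structure utility RLlib also uses

def convert_to_numpy_sketch(x):
    # Recursively walk dicts/lists/tuples and detach torch tensors to NumPy.
    def _mapper(item):
        if torch.is_tensor(item):
            return item.detach().cpu().numpy()
        return item
    return tree.map_structure(_mapper, x)

stats = {"mean_q": torch.tensor(1.5), "losses": [torch.tensor(0.1), torch.tensor(0.2)]}
print(convert_to_numpy_sketch(stats))  # all leaves are now NumPy scalars/arrays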
Example #17
 def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
     stats = {
         "policy_loss": self.p_loss,
         "total_loss": self.total_loss,
     }
     if self.config["beta"] != 0.0:
         stats[
             "moving_average_sqd_adv_norm"] = self._moving_average_sqd_adv_norm
         stats["vf_explained_var"] = self.explained_variance
         stats["vf_loss"] = self.v_loss
     return convert_to_numpy(stats)
Example #18
    def test_fqe_model(self):
        # Test FQETorchModel for:
        # (1) Check that it does not modify the underlying batch during training
        # (2) Check that the stopping criteria from FQE are working correctly
        # (3) Check that using fqe._compute_action_probs equals brute force
        # iterating over all actions with policy.compute_log_likelihoods
        fqe = FQETorchModel(
            policy=self.algo.get_policy(),
            gamma=self.gamma,
            **self.q_model_config,
        )
        tmp_batch = copy.deepcopy(self.batch)
        losses = fqe.train(self.batch)

        # Make sure FQETorchModel.train() does not modify self.batch
        check(tmp_batch, self.batch)

        # Make sure FQE stopping criteria are respected
        assert len(losses) == fqe.n_iters or losses[-1] < fqe.delta, (
            f"FQE.train() terminated early in {len(losses)} steps with final "
            f"loss {losses[-1]} for n_iters: {fqe.n_iters} and delta: {fqe.delta}"
        )

        # Test fqe._compute_action_probs against "brute force" method
        # of computing log_prob for each possible action individually
        # using policy.compute_log_likelihoods
        obs = torch.tensor(self.batch["obs"], device=fqe.device)
        action_probs = fqe._compute_action_probs(obs)
        action_probs = convert_to_numpy(action_probs)

        tmp_probs = []
        for act in range(fqe.policy.action_space.n):
            tmp_actions = np.zeros_like(self.batch["actions"]) + act
            log_probs = fqe.policy.compute_log_likelihoods(
                actions=tmp_actions,
                obs_batch=self.batch["obs"],
            )
            tmp_probs.append(torch.exp(log_probs))
        tmp_probs = torch.stack(tmp_probs).transpose(0, 1)
        tmp_probs = convert_to_numpy(tmp_probs)
        check(action_probs, tmp_probs, decimals=3)
Example #19
 def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
     return convert_to_numpy({
         "cur_lr":
         self.cur_lr,
         "entropy_coeff":
         self.entropy_coeff,
         "policy_entropy":
         torch.mean(torch.stack(self.get_tower_stats("entropy"))),
         "policy_loss":
         torch.mean(torch.stack(self.get_tower_stats("pi_err"))),
         "vf_loss":
         torch.mean(torch.stack(self.get_tower_stats("value_err"))),
     })
Example #20
    def action_prob(self, batch: SampleBatchType) -> np.ndarray:
        """Returns the probs for the batch actions for the current policy."""

        num_state_inputs = 0
        for k in batch.keys():
            if k.startswith("state_in_"):
                num_state_inputs += 1
        state_keys = ["state_in_{}".format(i) for i in range(num_state_inputs)]
        log_likelihoods: TensorType = self.policy.compute_log_likelihoods(
            actions=batch[SampleBatch.ACTIONS],
            obs_batch=batch[SampleBatch.CUR_OBS],
            state_batches=[batch[k] for k in state_keys],
            prev_action_batch=batch.data.get(SampleBatch.PREV_ACTIONS),
            prev_reward_batch=batch.data.get(SampleBatch.PREV_REWARDS))
        return np.exp(convert_to_numpy(log_likelihoods))
Example #21
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        """Returns the calculated loss in a stats dict.

        Args:
            policy: The Policy object.
            train_batch: The data used for training.

        Returns:
            Dict[str, TensorType]: The stats dict.
        """

        return convert_to_numpy({
            "policy_loss":
            torch.mean(torch.stack(self.get_tower_stats("policy_loss"))),
        })
Example #22
        def compute_actions_from_input_dict(
            self,
            input_dict: Dict[str, TensorType],
            explore: bool = None,
            timestep: Optional[int] = None,
            episodes: Optional[List[Episode]] = None,
            **kwargs,
        ) -> Tuple[TensorType, List[TensorType], Dict[str, TensorType]]:

            if not self.config.get(
                    "eager_tracing") and not tf1.executing_eagerly():
                tf1.enable_eager_execution()

            self._is_training = False

            explore = explore if explore is not None else self.explore
            timestep = timestep if timestep is not None else self.global_timestep
            if isinstance(timestep, tf.Tensor):
                timestep = int(timestep.numpy())

            # Pass lazy (eager) tensor dict to Model as `input_dict`.
            input_dict = self._lazy_tensor_dict(input_dict)
            input_dict.set_training(False)

            # Pack internal state inputs into (separate) list.
            state_batches = [
                input_dict[k] for k in input_dict.keys() if "state_in" in k[:8]
            ]
            self._state_in = state_batches
            self._is_recurrent = state_batches != []

            # Call the exploration before_compute_actions hook.
            self.exploration.before_compute_actions(timestep=timestep,
                                                    explore=explore,
                                                    tf_sess=self.get_session())

            ret = self._compute_actions_helper(
                input_dict,
                state_batches,
                # TODO: Passing episodes into a traced method does not work.
                None if self.config["eager_tracing"] else episodes,
                explore,
                timestep,
            )
            # Update our global timestep by the batch size.
            self.global_timestep.assign_add(
                tree.flatten(ret[0])[0].shape.as_list()[0])
            return convert_to_numpy(ret)
Example #23
        def compute_gradients(self, postprocessed_batch: SampleBatch) -> \
                Tuple[ModelGradients, Dict[str, TensorType]]:

            pad_batch_to_sequences_of_same_size(
                postprocessed_batch,
                shuffle=False,
                max_seq_len=self._max_seq_len,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

            self._is_training = True
            self._lazy_tensor_dict(postprocessed_batch)
            postprocessed_batch.set_training(True)
            grads_and_vars, grads, stats = self._compute_gradients_helper(
                postprocessed_batch)
            return convert_to_numpy((grads, stats))
Example #24
    def estimate(self, batch: SampleBatchType) -> List[OffPolicyEstimate]:
        self.check_can_estimate_for(batch)
        estimates = []
        # Split data into train and test batches
        for train_episodes, test_episodes in train_test_split(
            batch,
            self.train_test_split_val,
            self.k,
        ):

            # Train Q-function
            if train_episodes:
                # Reinitialize model
                self.model.reset()
                train_batch = SampleBatch.concat_samples(train_episodes)
                losses = self.train(train_batch)
                self.losses.append(losses)

            # Calculate direct method OPE estimates
            for episode in test_episodes:
                rewards = episode["rewards"]
                v_old = 0.0
                v_new = 0.0
                for t in range(episode.count):
                    v_old += rewards[t] * self.gamma ** t

                init_step = episode[0:1]
                init_obs = np.array([init_step[SampleBatch.OBS]])
                all_actions = np.arange(self.policy.action_space.n, dtype=float)
                init_step[SampleBatch.ACTIONS] = all_actions
                action_probs = np.exp(self.action_log_likelihood(init_step))
                v_value = self.model.estimate_v(init_obs, action_probs)
                v_new = convert_to_numpy(v_value).item()

                estimates.append(
                    OffPolicyEstimate(
                        self.name,
                        {
                            "v_old": v_old,
                            "v_new": v_new,
                            "v_gain": v_new / max(1e-8, v_old),
                        },
                    )
                )
        return estimates
Example #25
 def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
     return convert_to_numpy({
         "cur_lr":
         self.cur_lr,
         "total_loss":
         torch.mean(torch.stack(self.get_tower_stats("total_loss"))),
         "policy_loss":
         torch.mean(torch.stack(self.get_tower_stats("pi_loss"))),
         "entropy":
         torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))),
         "entropy_coeff":
         self.entropy_coeff,
         "var_gnorm":
         global_norm(self.model.trainable_variables()),
         "vf_loss":
         torch.mean(torch.stack(self.get_tower_stats("vf_loss"))),
         "vf_explained_var":
         torch.mean(torch.stack(self.get_tower_stats("vf_explained_var"))),
     })
Example #26
    def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
        """Stats function for APPO. Returns a dict with important loss stats.

        Args:
            policy: The Policy to generate stats for.
            train_batch: The SampleBatch (already) used for training.

        Returns:
            Dict[str, TensorType]: The stats dict.
        """
        stats_dict = {
            "cur_lr":
            self.cur_lr,
            "total_loss":
            torch.mean(torch.stack(self.get_tower_stats("total_loss"))),
            "policy_loss":
            torch.mean(torch.stack(self.get_tower_stats("mean_policy_loss"))),
            "entropy":
            torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))),
            "entropy_coeff":
            self.entropy_coeff,
            "var_gnorm":
            global_norm(self.model.trainable_variables()),
            "vf_loss":
            torch.mean(torch.stack(self.get_tower_stats("mean_vf_loss"))),
            "vf_explained_var":
            torch.mean(torch.stack(self.get_tower_stats("vf_explained_var"))),
        }

        if self.config["vtrace"]:
            is_stat_mean = torch.mean(self._is_ratio, [0, 1])
            is_stat_var = torch.var(self._is_ratio, [0, 1])
            stats_dict["mean_IS"] = is_stat_mean
            stats_dict["var_IS"] = is_stat_var

        if self.config["use_kl_loss"]:
            stats_dict["kl"] = torch.mean(
                torch.stack(self.get_tower_stats("mean_kl_loss")))
            stats_dict["KL_Coeff"] = self.kl_coeff

        return convert_to_numpy(stats_dict)
Example #27
    def estimate(self, batch: SampleBatchType) -> Dict[str, Any]:
        """Compute off-policy estimates.

        Args:
            batch: The SampleBatch to run off-policy estimation on

        Returns:
            A dict consisting of the following metrics:
            - v_behavior: The discounted return averaged over episodes in the batch
            - v_behavior_std: The standard deviation corresponding to v_behavior
            - v_target: The estimated discounted return for `self.policy`,
            averaged over episodes in the batch
            - v_target_std: The standard deviation corresponding to v_target
            - v_gain: v_target / max(v_behavior, 1e-8), averaged over episodes
            - v_gain_std: The standard deviation corresponding to v_gain
        """
        batch = self.convert_ma_batch_to_sample_batch(batch)
        self.check_action_prob_in_batch(batch)
        estimates = {"v_behavior": [], "v_target": [], "v_gain": []}
        # Calculate Direct Method OPE estimates
        for episode in batch.split_by_episode():
            rewards = episode["rewards"]
            v_behavior = 0.0
            v_target = 0.0
            for t in range(episode.count):
                v_behavior += rewards[t] * self.gamma ** t

            init_step = episode[0:1]
            v_target = self.model.estimate_v(init_step)
            v_target = convert_to_numpy(v_target).item()

            estimates["v_behavior"].append(v_behavior)
            estimates["v_target"].append(v_target)
            estimates["v_gain"].append(v_target / max(v_behavior, 1e-8))
        estimates["v_behavior_std"] = np.std(estimates["v_behavior"])
        estimates["v_behavior"] = np.mean(estimates["v_behavior"])
        estimates["v_target_std"] = np.std(estimates["v_target"])
        estimates["v_target"] = np.mean(estimates["v_target"])
        estimates["v_gain_std"] = np.std(estimates["v_gain"])
        estimates["v_gain"] = np.mean(estimates["v_gain"])
        return estimates
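The direct-method target above only needs the first step of each episode, because the estimate is the learned value of the initial state, V^pi(s_0). A toy sketch of that idea with a stand-in value function (hypothetical names; not RLlib's FQE model):

import numpy as np

def direct_method_estimate(episodes, v_hat, gamma):
    # episodes: list of dicts with "obs" and "rewards" arrays.
    # v_hat: callable mapping an initial observation to an estimated value.
    v_behavior, v_target = [], []
    for ep in episodes:
        rewards = ep["rewards"]
        v_behavior.append(sum(r * gamma**t for t, r in enumerate(rewards)))
        v_target.append(float(v_hat(ep["obs"][0])))
    gains = [t / max(b, 1e-8) for b, t in zip(v_behavior, v_target)]
    return {
        "v_behavior": np.mean(v_behavior),
        "v_target": np.mean(v_target),
        "v_gain": np.mean(gains),
    }

episodes = [{"obs": np.zeros((3, 4)), "rewards": np.array([1.0, 0.0, 1.0])}]
print(direct_method_estimate(episodes, v_hat=lambda obs: 2.0, gamma=0.99))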
Example #28
 def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
     return convert_to_numpy({
         "cur_kl_coeff":
         self.kl_coeff,
         "cur_lr":
         self.cur_lr,
         "total_loss":
         torch.mean(torch.stack(self.get_tower_stats("total_loss"))),
         "policy_loss":
         torch.mean(torch.stack(self.get_tower_stats("mean_policy_loss"))),
         "vf_loss":
         torch.mean(torch.stack(self.get_tower_stats("mean_vf_loss"))),
         "vf_explained_var":
         torch.mean(torch.stack(self.get_tower_stats("vf_explained_var"))),
         "kl":
         torch.mean(torch.stack(self.get_tower_stats("mean_kl_loss"))),
         "entropy":
         torch.mean(torch.stack(self.get_tower_stats("mean_entropy"))),
         "entropy_coeff":
         self.entropy_coeff,
     })
Example #29
def _process_policy_eval_results(
    *,
    to_eval: Dict[PolicyID, List[PolicyEvalData]],
    eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]],
    active_episodes: Dict[str, MultiAgentEpisode],
    active_envs: Set[int],
    off_policy_actions: MultiEnvDict,
    policies: Dict[PolicyID, Policy],
    clip_actions: bool,
) -> Dict[EnvID, Dict[AgentID, EnvActionType]]:
    """Process the output of policy neural network evaluation.

    Records policy evaluation results into the given episode objects and
    returns replies to send back to agents in the env.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy IDs
            to lists of PolicyEvalData objects.
        eval_results (Dict[PolicyID, List]): Mapping of policy IDs to list of
            actions, rnn-out states, extra-action-fetches dicts.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from
            episode ID to currently ongoing MultiAgentEpisode object.
        active_envs (Set[int]): Set of non-terminated env ids.
        off_policy_actions (dict): Doubly keyed dict of env-ids -> agent ids ->
            off-policy-action, returned by a `BaseEnv.poll()` call.
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy.
        clip_actions (bool): Whether to clip actions to the action space's
            bounds.

    Returns:
        actions_to_send: Nested dict of env id -> agent id -> actions to be
            sent to Env (np.ndarrays).
    """

    actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \
        defaultdict(dict)

    # type: int
    for env_id in active_envs:
        actions_to_send[env_id] = {}  # at minimum send empty dict

    # type: PolicyID, List[PolicyEvalData]
    for policy_id, eval_data in to_eval.items():
        actions: TensorStructType = eval_results[policy_id][0]
        actions = convert_to_numpy(actions)

        rnn_out_cols: StateBatch = eval_results[policy_id][1]
        pi_info_cols: dict = eval_results[policy_id][2]

        # In case actions is a list (representing the 0th dim of a batch of
        # primitive actions), try to convert it first.
        if isinstance(actions, list):
            actions = np.array(actions)

        # Store RNN state ins/outs and extra-action fetches to episode.
        for f_i, column in enumerate(rnn_out_cols):
            pi_info_cols["state_out_{}".format(f_i)] = column

        policy: Policy = _get_or_raise(policies, policy_id)
        # Split action-component batches into single action rows.
        actions: List[EnvActionType] = unbatch(actions)
        # type: int, EnvActionType
        for i, action in enumerate(actions):
            # Clip if necessary.
            if clip_actions:
                clipped_action = clip_action(action,
                                             policy.action_space_struct)
            else:
                clipped_action = action

            env_id: int = eval_data[i].env_id
            agent_id: AgentID = eval_data[i].agent_id
            episode: MultiAgentEpisode = active_episodes[env_id]
            episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
            episode._set_last_pi_info(
                agent_id, {k: v[i]
                           for k, v in pi_info_cols.items()})
            if env_id in off_policy_actions and \
                    agent_id in off_policy_actions[env_id]:
                episode._set_last_action(agent_id,
                                         off_policy_actions[env_id][agent_id])
            else:
                episode._set_last_action(agent_id, action)

            assert agent_id not in actions_to_send[env_id]
            actions_to_send[env_id][agent_id] = clipped_action

    return actions_to_send
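unbatch() above splits a possibly nested, batched action structure (e.g., a dict of arrays with a leading batch dimension) into a list of per-row structures. A rough behavioral sketch using dm-tree (illustrative, not RLlib's exact implementation):

import numpy as np
import tree  # dm-tree

def unbatch_sketch(batched):
    flat = tree.flatten(batched)
    batch_size = len(flat[0])
    return [
        tree.unflatten_as(batched, [col[i] for col in flat])
        for i in range(batch_size)
    ]

batched_actions = {"move": np.array([[0.1, 0.2], [0.3, 0.4]]), "jump": np.array([0, 1])}
print(unbatch_sketch(batched_actions))  # two dicts, one per batch row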
Example #30
    def _compute_action_helper(self, input_dict, state_batches, seq_lens,
                               explore, timestep):
        """Shared forward pass logic (w/ and w/o trajectory view API).

        Returns:
            A tuple consisting of a) actions, b) state_out, c) extra_fetches.
        """
        explore = explore if explore is not None else self.config["explore"]
        timestep = timestep if timestep is not None else self.global_timestep
        self._is_recurrent = state_batches is not None and state_batches != []

        # Switch to eval mode.
        if self.model:
            self.model.eval()

        if is_overridden(self.action_sampler_fn):
            action_dist = dist_inputs = None
            actions, logp, state_out = self.action_sampler_fn(
                self.model,
                obs_batch=input_dict,
                state_batches=state_batches,
                explore=explore,
                timestep=timestep,
            )
        else:
            # Call the exploration before_compute_actions hook.
            self.exploration.before_compute_actions(explore=explore,
                                                    timestep=timestep)
            if is_overridden(self.action_distribution_fn):
                dist_inputs, dist_class, state_out = self.action_distribution_fn(
                    self.model,
                    obs_batch=input_dict,
                    state_batches=state_batches,
                    seq_lens=seq_lens,
                    explore=explore,
                    timestep=timestep,
                    is_training=False,
                )
            else:
                dist_class = self.dist_class
                dist_inputs, state_out = self.model(input_dict, state_batches,
                                                    seq_lens)

            if not (isinstance(dist_class, functools.partial)
                    or issubclass(dist_class, TorchDistributionWrapper)):
                raise ValueError(
                    "`dist_class` ({}) not a TorchDistributionWrapper "
                    "subclass! Make sure your `action_distribution_fn` or "
                    "`make_model_and_action_dist` return a correct "
                    "distribution class.".format(dist_class.__name__))
            action_dist = dist_class(dist_inputs, self.model)

            # Get the exploration action from the forward results.
            actions, logp = self.exploration.get_exploration_action(
                action_distribution=action_dist,
                timestep=timestep,
                explore=explore)

        input_dict[SampleBatch.ACTIONS] = actions

        # Add default and custom fetches.
        extra_fetches = self.extra_action_out(input_dict, state_batches,
                                              self.model, action_dist)

        # Action-dist inputs.
        if dist_inputs is not None:
            extra_fetches[SampleBatch.ACTION_DIST_INPUTS] = dist_inputs

        # Action-logp and action-prob.
        if logp is not None:
            extra_fetches[SampleBatch.ACTION_PROB] = torch.exp(logp.float())
            extra_fetches[SampleBatch.ACTION_LOGP] = logp

        # Update our global timestep by the batch size.
        self.global_timestep += len(input_dict[SampleBatch.CUR_OBS])

        return convert_to_numpy((actions, state_out, extra_fetches))