Example #1
def test_building_shared_agent_policies():
    env = build_dummy_structured_env()
    model_config = _dummy_model_config()
    policy = _dummy_policy_for(model_config, env, agent_counts_dict={0: 1, 1: 3})

    assert len(policy.networks) == 2
    assert [0, 1] == list(policy.networks.keys())

    assert isinstance(policy.network_for(actor_id=ActorID(0, 0)), DummyPolicyNet)
    assert isinstance(policy.network_for(actor_id=ActorID(1, 0)), DummyPolicyNet)
Example #2
def test_building_separated_and_shared_agent_policies():
    env = build_dummy_structured_env()
    model_config = _dummy_model_config()
    model_config["policy"]["substeps_with_separate_agent_nets"] = [1]
    policy = _dummy_policy_for(model_config, env, agent_counts_dict={0: 2, 1: 3})

    assert len(policy.networks) == 4
    assert [0, (1, 0), (1, 1), (1, 2)] == list(policy.networks.keys())

    assert isinstance(policy.network_for(actor_id=ActorID(0, 0)), DummyPolicyNet)
    assert isinstance(policy.network_for(actor_id=ActorID(1, 0)), DummyPolicyNet)
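
Taken together, the two tests above pin down how the policy keys its networks: one network per sub-step key by default, and one network per (sub-step, agent) pair for sub-steps listed in substeps_with_separate_agent_nets. A minimal sketch of that keying rule, using a hypothetical helper name build_network_keys that is not part of the framework:

from typing import Dict, List, Tuple, Union

def build_network_keys(agent_counts_dict: Dict[int, int],
                       separated_substeps: List[int]) -> List[Union[int, Tuple[int, int]]]:
    """Illustrative only: reproduce the network keys asserted in the two tests above."""
    keys: List[Union[int, Tuple[int, int]]] = []
    for step_key, agent_count in agent_counts_dict.items():
        if step_key in separated_substeps:
            # Separate agent nets: one key per (sub-step, agent) pair.
            keys.extend((step_key, agent_idx) for agent_idx in range(agent_count))
        else:
            # Shared agent net: a single key per sub-step.
            keys.append(step_key)
    return keys

assert build_network_keys({0: 1, 1: 3}, separated_substeps=[]) == [0, 1]
assert build_network_keys({0: 2, 1: 3}, separated_substeps=[1]) == [0, (1, 0), (1, 1), (1, 2)]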
Example #3
def _mock_structured_spaces_record(step_no: int, done: bool = False):
    return StructuredSpacesRecord(substep_records=[
        _mock_spaces_record(actor_id=ActorID(0, 0),
                            keys=["x", "y"],
                            value=[step_no * 10, step_no * 10],
                            reward=step_no),
        _mock_spaces_record(actor_id=ActorID(1, 0),
                            keys=["z"],
                            value=[step_no * 10 + 1],
                            reward=step_no,
                            done=done),
    ])
Example #4
    def actor_id(self) -> ActorID:
        """Returns the currently executed actor along with the policy id. The id is unique only with
        respect to the policies (every policy has its own actor 0).
        Note that identities of done actors cannot be reused in the same rollout.

        :return: The current actor, as tuple (policy id, actor number).
        """
        return ActorID(step_key=0, agent_id=0)
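
As the docstring states, an ActorID is simply a (policy/step key, agent number) pair, and agent numbers are unique only within a policy. A small illustration of that semantics, assuming the usual named-tuple ActorID from maze.core.env.structured_env:

from maze.core.env.structured_env import ActorID

# Agent 0 of policy 0 and agent 0 of policy 1 are distinct actors, even though they share the agent number.
actor_a = ActorID(step_key=0, agent_id=0)
actor_b = ActorID(step_key=1, agent_id=0)

assert actor_a != actor_b
assert actor_a.agent_id == actor_b.agent_id  # agent ids are unique only per policy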
Example #5
def _mock_space_record(value: int):
    substep_record = SpacesRecord(
        actor_id=ActorID(0, 0),
        observation=dict(observation=np.array(value)),
        action=dict(action=np.array(value)),
        reward=value,
        done=value > 0)

    return StructuredSpacesRecord(substep_records=[substep_record])
Example #6
def test_default_policy():
    """ unit tests """
    default_policy = DefaultPolicy({
        "policy_0": DummyFlatPolicy("action_0"),
        "policy_1": DummyFlatPolicy("action_1")
    })
    action = default_policy.compute_action(observation={},
                                           actor_id=ActorID("policy_0", 0),
                                           deterministic=True)
    assert "action_0" in action
    action = default_policy.compute_action(observation={},
                                           actor_id=ActorID("policy_1", 0),
                                           deterministic=True)
    assert "action_1" in action

    default_policy = DefaultPolicy({"policy_0": DummyFlatPolicy("action_0")})
    action = default_policy.compute_action(observation={},
                                           actor_id=None,
                                           deterministic=True)
    assert "action_0" in action
Example #7
def test_serialized_torch_policy():
    """ unit tests """

    # init structured env
    env = build_dummy_structured_env()

    model_config = {
        "_target_": CustomModelComposer,
        "distribution_mapper_config": {},
        "policy": {
            "_target_":
            "maze.perception.models.policies.ProbabilisticPolicyComposer",
            "networks": [{
                "_target_":
                "maze.test.shared_test_utils.dummy_models.actor_model.DummyPolicyNet",
                "non_lin": "torch.nn.SELU"
            }, {
                "_target_":
                "maze.test.shared_test_utils.dummy_models.actor_model.DummyPolicyNet",
                "non_lin": "torch.nn.SELU"
            }],
            "substeps_with_separate_agent_nets": []
        },
        "critic": None
    }

    # no critic
    composer = CustomModelComposer(
        action_spaces_dict=env.action_spaces_dict,
        observation_spaces_dict=env.observation_spaces_dict,
        agent_counts_dict=env.agent_counts_dict,
        distribution_mapper_config=[],
        policy=model_config["policy"],
        critic=model_config["critic"])

    # dump state dict
    state_dict = composer.policy.state_dict()
    torch.save(state_dict, "state_dict.pt")

    SpacesConfig(composer.action_spaces_dict, composer.observation_spaces_dict,
                 composer.agent_counts_dict).save("spaces_config.pkl")

    # init policy
    policy = SerializedTorchPolicy(model=model_config,
                                   state_dict_file="state_dict.pt",
                                   spaces_dict_file="spaces_config.pkl",
                                   device="cpu")

    action = policy.compute_action(observation=env.observation_space.sample(),
                                   actor_id=ActorID(0, 0))
    assert isinstance(action, dict)
Example #8
def _mock_spaces_trajectory_record(step_count: int):
    """Produce an episode record with maze_states and maze_actions corresponding to the step no."""
    episode_record = SpacesTrajectoryRecord("test")

    for i in range(step_count):
        substep_record = SpacesRecord(
            actor_id=ActorID(0, 0),
            observation=dict(observation=np.array(i)),
            action=dict(action=np.array(i)),
            reward=0,
            done=i == step_count - 1)
        episode_record.step_records.append(
            StructuredSpacesRecord(substep_records=[substep_record]))

    return episode_record
Example #9
def random_env_steps(env: ObservationNormalizationWrapper,
                     steps: int) -> np.ndarray:
    """Randomly interact with environment"""
    observations = []
    obs = env.reset()
    observations.append(obs["observation"])
    for _ in range(steps):
        action = env.sampling_policy.compute_action(obs,
                                                    maze_state=None,
                                                    env=env,
                                                    actor_id=ActorID(0, 0),
                                                    deterministic=False)
        obs, rew, done, info = env.step(action)
        observations.append(obs["observation"])
        if done:
            obs = env.reset()
            observations.append(obs["observation"])
    return np.vstack(observations)
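
A hedged usage sketch: since random_env_steps stacks all collected observations into one array, it can be used to estimate normalization statistics; the env variable below is assumed to be an ObservationNormalizationWrapper instance:

import numpy as np

# Collect observations from 100 random environment steps and derive per-feature statistics.
observations = random_env_steps(env, steps=100)   # shape: (num_collected_observations, obs_dim)
obs_mean = np.mean(observations, axis=0)
obs_std = np.std(observations, axis=0)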
Example #10
    def _build_critic_input_space_dict(self) -> Dict[StepKeyType, spaces.Dict]:
        """Build the critic input from the given observation input and a dummy pass through the policy network (in case
        shared embeddings are used).

        :return: The dict holding the new critic input spaces dict, needed for building the model.
        """
        critic_input_spaces_dict = dict(
            copy.deepcopy(self.observation_spaces_dict))
        for step_key, obs_space in self.observation_spaces_dict.items():
            step_observation = dict()
            for obs_key, obs in obs_space.spaces.items():
                if isinstance(obs, spaces.Box) and (np.any(obs_space[obs_key].low == np.finfo(np.float32).min) or
                                                    np.any(obs_space[obs_key].high == np.finfo(np.float32).max)):
                    # In case any of the lower or upper bounds of the space are infinite, resample the values.
                    step_observation[obs_key] = np.random.randn(
                        *obs.shape).astype(np.float32)
                else:
                    # Set the random generator to None. If the observation spaces have been loaded from
                    #   a file, not resetting it may lead to problems.
                    obs._np_random = None
                    step_observation[obs_key] = obs.sample()
            tmp_out = self._policy_composer.policy.compute_substep_policy_output(
                step_observation, actor_id=ActorID(step_key, 0))
            if tmp_out.embedding_logits is not None:
                new_observation_space = dict()
                critic_input = StateCriticStepInput.build(
                    tmp_out, step_observation)
                for in_key, in_value in critic_input.tensor_dict.items():
                    if in_key in critic_input_spaces_dict[step_key]:
                        new_observation_space[
                            in_key] = critic_input_spaces_dict[step_key][
                                in_key]
                    else:
                        new_observation_space[in_key] = spaces.Box(
                            low=np.finfo(np.float32).min,
                            high=np.finfo(np.float32).max,
                            shape=in_value.shape,
                            dtype=np.float32)
                critic_input_spaces_dict[step_key] = gym.spaces.Dict(
                    dict(new_observation_space))
        return critic_input_spaces_dict
Example #11
    def act(
        self,
        maze_state: MazeStateType,
        reward: Union[None, float, np.ndarray, Any],
        done: bool,
        info: Union[None, Dict[Any, Any]],
        events: Optional[List[EventRecord]] = None,
        actor_id: ActorID = ActorID(0, 0)
    ) -> MazeActionType:
        """Query the agent for MazeAction derived from the given state.

        Passes the state etc. to the agent's thread, where it is integrated into an ordinary env rollout loop.
        In the first step, an env reset call is propagated through the env wrapper stack on the agent's thread.

        :param maze_state: Current state of the environment.
        :param reward: Reward for the previous step (can be None in the initial step).
        :param done: Whether the external environment is done.
        :param info: Info dictionary.
        :param events: List of events to be recorded for this step (mainly useful for statistics and event logs).
        :param actor_id: Optional ID of the actor to run next (comprised of policy_id and agent_id).
        :return: MazeAction from the agent.
        """
        if self.rollout_done:
            raise RuntimeError(
                "External env has been declared done already. Please create a new connector object "
                "for a new episode.")

        self.external_core_env.set_actor_id(actor_id)
        self.state_queue.put((maze_state, reward, done, info, events))
        # Here, the MazeAction is suspended until the agent on the second thread runs another step and the MazeAction
        # is passed back through the MazeAction queue.
        maze_action = self.maze_action_queue.get()

        # If exception occurs in the agent thread, it will be passed back using this same queue as an exception report.
        if isinstance(maze_action, ExceptionReport):
            exc_report = maze_action
            raise RuntimeError("Error encountered in agent thread:\n" +
                               exc_report.traceback) from exc_report.exception

        return maze_action
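
The method is designed to be driven from an external control loop that owns the real environment: each call hands the current state to the agent thread and blocks until the corresponding MazeAction comes back. A rough sketch of such a loop, where connector and external_env are assumed names for the connector object and the externally stepped environment:

# Illustrative external control loop; connector.act(...) bridges to the agent thread.
maze_state = external_env.reset()
reward, done, info = None, False, None

while not done:
    maze_action = connector.act(maze_state, reward, done, info)
    maze_state, reward, done, info = external_env.step(maze_action)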
Example #12
    def converted_from(cls, state_record: StateRecord, conversion_env: MazeEnv, first_step_in_episode: bool) \
            -> 'StructuredSpacesRecord':
        """Convert a state record (containing a Maze state and Maze action) into a spaces record (containing
        raw actions and observations for each sub-step).

        Maze states and actions are converted to spaces using the supplied conversion env -- its action and
        observation interfaces, as well as its wrapper stack, determine the format of the converted actions
        and observations.

        This is useful e.g. for behavioral cloning, when we have recorded Maze states and actions from teacher runs,
        and now need to convert these into raw actions and observations to be fed to a model.

        Note that multi-agent scenarios are not supported yet (the conversion only supports a single
        action-observation pair per sub-step key).

        :param state_record: State record to convert.
        :param conversion_env: Environment to use for the conversion. Determines the format of the resulting spaces.
        :param first_step_in_episode: Flag whether this is the first step in an episode (to reset stateful wrappers).
        :return: Converted spaces record.
        """
        obs = state_record.maze_state.observation if isinstance(
            state_record.maze_state, RawState) else state_record.maze_state
        action = state_record.maze_action.action if isinstance(
            state_record.maze_action,
            RawMazeAction) else state_record.maze_action

        obs, action = conversion_env.get_observation_and_action_dicts(
            obs, action, first_step_in_episode)

        substep_records = [
            SpacesRecord(actor_id=ActorID(substep_key, 0),
                         observation=obs[substep_key],
                         action=action[substep_key],
                         reward=None,
                         done=None) for substep_key in obs.keys()
        ]

        substep_records[-1].done = state_record.done

        return StructuredSpacesRecord(substep_records=substep_records)
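
A hedged usage sketch for the behavioral-cloning scenario mentioned in the docstring: converting every state record of a recorded episode into a spaces record, flagging only the first one as the start of the episode (episode_record and conversion_env are assumed names):

# Convert recorded Maze states/actions into raw observations/actions suitable for model training.
spaces_records = [
    StructuredSpacesRecord.converted_from(state_record=state_record,
                                          conversion_env=conversion_env,
                                          first_step_in_episode=(idx == 0))
    for idx, state_record in enumerate(episode_record.step_records)
]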
Example #13
def debatch_actor_ids(actor_ids: List[ActorID]) -> List[ActorID]:
    """If actor ids are returned by the dataloader they are batched in the step_key and agent_id fields. Since a single
        batch of value should correlate in agent_id and step key, this holds redundant information and should be
        reversed for it to work properly with all other parts of the framework. """
    for idx in range(len(actor_ids)):
        actor_id_tmp = actor_ids[idx]
        if isinstance(actor_id_tmp.agent_id, torch.Tensor) and isinstance(
                actor_id_tmp.step_key, torch.Tensor):
            assert len(set(
                actor_id_tmp.agent_id.tolist())) == 1, actor_id_tmp.agent_id
            assert len(set(
                actor_id_tmp.step_key.tolist())) == 1, actor_id_tmp.step_key
            actor_ids[idx] = ActorID(step_key=actor_id_tmp.step_key[0].item(),
                                     agent_id=actor_id_tmp.agent_id[0].item())
        elif isinstance(actor_id_tmp.agent_id, int) and isinstance(
                actor_id_tmp.step_key, (int, str)):
            pass
        else:
            raise NotImplementedError(
                f'Not implemented batched actor id type found: {type(actor_id_tmp.agent_id)}'
            )

    return actor_ids
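
A small sketch of the situation the docstring describes: a dataloader collating four transitions of the same sub-step yields tensor-valued ActorID fields, which debatch_actor_ids collapses back into plain integers (the batched actor id below is constructed by hand for illustration):

import torch

# Batched actor id as a dataloader might produce it for four identical (step_key=0, agent_id=0) entries.
batched = ActorID(step_key=torch.tensor([0, 0, 0, 0]), agent_id=torch.tensor([0, 0, 0, 0]))

assert debatch_actor_ids([batched]) == [ActorID(step_key=0, agent_id=0)]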
Example #14
    def actor_id(self) -> ActorID:
        """Single-step, single-agent environment"""
        return ActorID(step_key=0, agent_id=0)
Example #15
    def _compute_policy_loss(self, worker_output: StructuredSpacesRecord) -> \
            Tuple[Dict[Union[str, int], torch.Tensor],
                  Dict[Union[str, int], Union[torch.Tensor, Dict[str, torch.Tensor]]],
                  Dict[Union[str, int], Union[torch.Tensor, Dict[str, torch.Tensor]]],
                  Dict[Union[str, int], torch.Tensor]]:
        """Compute the critic losses.

        :param worker_output: The batched output of the workers.
        :return: The policy losses as well as a few other metrics needed for the entropy loss computation and stats.
        """

        # Sample actions and compute action log probabilities (continuous steps)/ action probabilities (discrete steps)
        policy_losses, action_entropies, action_log_probs, actions_sampled = dict(
        ), dict(), dict(), dict()
        action_probs = dict()

        for step_key in self.sub_step_keys:
            step_obs = worker_output.observations_dict[step_key]
            learner_policy_out = self.learner_model.policy.compute_substep_policy_output(
                step_obs, ActorID(step_key, 0))
            learner_action = learner_policy_out.prob_dist.sample()

            # Average the logp_policy of all actions in this step (all steps if shared critic)
            if self.learner_model.critic.only_discrete_spaces[step_key]:
                probs_policy = {
                    action_key: logits_to_probs(x)
                    for action_key, x in
                    learner_policy_out.action_logits.items()
                }
                logp_policy = {
                    action_key: torch.log(x + (x == 0.0).float() * 1e-8)
                    for action_key, x in probs_policy.items()
                }
            else:
                probs_policy = None
                logp_policy = torch.stack(
                    list(
                        learner_policy_out.prob_dist.log_prob(
                            learner_action).values())).mean(dim=0)

            action_probs[step_key] = probs_policy
            action_log_probs[step_key] = logp_policy
            actions_sampled[step_key] = learner_action
            action_entropies[step_key] = learner_policy_out.entropy

        # Predict Q values
        q_values = self.learner_model.critic.predict_q_values(
            worker_output.observations_dict,
            actions_sampled,
            gather_output=False)
        if len(q_values) < len(self.sub_step_keys):
            assert len(q_values) == 1
            critic_key = list(q_values.keys())[0]
            q_values = {
                step_key: q_values[critic_key]
                for step_key in self.sub_step_keys
            }

        # Compute loss
        for step_key in self.sub_step_keys:
            action_log_probs_step = action_log_probs[step_key]
            q_values_step = q_values[step_key]

            if self.learner_model.critic.only_discrete_spaces[step_key]:
                action_probs_step = action_probs[step_key]

                policy_losses_per_action = list()
                # Compute the policy loss for each individual action
                for action_key in action_log_probs_step.keys():
                    q_action_key = action_key + '_q_values'
                    action_q_values = torch.stack([
                        q_values_sub_critic[q_action_key]
                        for q_values_sub_critic in q_values_step
                    ]).min(dim=0).values
                    q_term = (self.curr_entropy_coef[step_key] *
                              action_log_probs_step[action_key] -
                              action_q_values)
                    action_policy_loss = torch.matmul(
                        action_probs_step[action_key].unsqueeze(-2),
                        q_term.unsqueeze(-1)).squeeze(-1).squeeze(-1)
                    policy_losses_per_action.append(action_policy_loss)
                # Sum the losses of all actions together
                policy_losses_per_step = torch.stack(
                    policy_losses_per_action).sum(dim=0)
                # Average the losses w.r.t. the batch
                policy_losses[step_key] = policy_losses_per_step.mean()
            else:
                # Do not detach q_values in discrete setting
                q_value_per_step = torch.stack(q_values_step).min(dim=0).values
                # Average the losses w.r.t. the batch
                policy_losses[step_key] = torch.mean(
                    (self.curr_entropy_coef[step_key] * action_log_probs_step -
                     q_value_per_step))

        return policy_losses, action_probs, action_log_probs, action_entropies
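
For the continuous (non-discrete) branch, the per-step loss reduces to the familiar soft-actor-critic objective: the batch mean of alpha * log pi(a|s) minus the element-wise minimum of the critics' Q estimates. A tiny self-contained numeric sketch of exactly that tensor arithmetic (made-up values, no learner model involved):

import torch

# Q estimates of two critics for a batch of three samples, plus the policy's log-probs for the sampled actions.
q_values_per_critic = [torch.tensor([1.0, 2.0, 3.0]), torch.tensor([0.5, 2.5, 2.0])]
action_log_probs_step = torch.tensor([-1.2, -0.7, -0.3])
entropy_coef = 0.2

# Element-wise minimum over critics, then the batch-averaged SAC policy loss.
q_value_per_step = torch.stack(q_values_per_critic).min(dim=0).values   # tensor([0.5, 2.0, 2.0])
policy_loss = torch.mean(entropy_coef * action_log_probs_step - q_value_per_step)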
Example #16
    def _compute_critic_loss(self, worker_output: StructuredSpacesRecord) -> \
            Tuple[List[Dict[Union[str, int], torch.Tensor]], List[Dict[Union[str, int], torch.Tensor]]]:
        """Compute the critic losses.

        :param worker_output: The batched output of the workers.
        :return: The critic losses as a list with one entry per critic used, plus mean Q values for stats.
        """
        next_actions = dict()
        next_actions_logits = dict()
        next_action_log_probs = dict()

        q_values_selected = self.learner_model.critic.predict_q_values(
            worker_output.observations_dict,
            worker_output.actions_dict,
            gather_output=True)
        q_values_mean = {
            step_key: [
                curr_q.detach().mean().item()
                if isinstance(curr_q, torch.Tensor) else torch.stack(
                    list(curr_q.values())).mean(dim=1).detach().mean().item()
                for curr_q in q_values_list
            ]
            for step_key, q_values_list in q_values_selected.items()
        }

        with torch.no_grad():
            for step_key in self.sub_step_keys:
                next_policy_output = self.learner_model.policy.compute_substep_policy_output(
                    worker_output.next_observations_dict[step_key],
                    actor_id=ActorID(step_key, 0))
                next_action = next_policy_output.prob_dist.sample()

                next_action_log_probs[
                    step_key] = next_policy_output.prob_dist.log_prob(
                        next_action)
                next_actions_logits[
                    step_key] = next_policy_output.action_logits
                next_actions[step_key] = next_action

            next_q_values = self.learner_model.critic.predict_next_q_values(
                worker_output.next_observations_dict, next_actions,
                next_actions_logits, next_action_log_probs,
                self.curr_entropy_coef)

            target_q_values = dict()

            # TODO: Take into account all rewards, not just from the last sub-step
            last_rewards = list(worker_output.rewards_dict.values())[-1]
            last_dones = list(worker_output.dones_dict.values())[-1]

            for step_key, next_q_value_per_step in next_q_values.items():
                if self.learner_model.critic.only_discrete_spaces[step_key]:
                    assert isinstance(next_q_value_per_step, dict)
                    target_q_values[step_key] = {
                        action_key:
                        (last_rewards + (~last_dones).float() *
                         self.algorithm_config.gamma * next_action_q_value)
                        for action_key, next_action_q_value in
                        next_q_value_per_step.items()
                    }
                else:
                    assert isinstance(next_q_value_per_step, torch.Tensor)
                    target_q_values[step_key] = (
                        last_rewards + (~last_dones).float() *
                        self.algorithm_config.gamma * next_q_value_per_step)

        q_losses = dict()
        for step_key in q_values_selected:
            per_critic_values = list()
            for q_values_per_sub_critic in q_values_selected[step_key]:
                target_q_values_per_step = target_q_values[step_key]
                if self.learner_model.critic.only_discrete_spaces[step_key]:
                    assert isinstance(q_values_per_sub_critic, dict)
                    per_action_per_critic_loss = list()
                    for action_key, q_values_per_action in q_values_per_sub_critic.items(
                    ):
                        org_action_key = action_key.replace('_q_values', '')
                        per_action_loss = (
                            q_values_per_action -
                            target_q_values_per_step[org_action_key]
                        ).pow(2).mean()
                        per_action_per_critic_loss.append(per_action_loss)
                    # Sum the losses of the individual actions in one step together
                    per_critic_values.append(
                        torch.stack(per_action_per_critic_loss).sum(dim=0))
                else:
                    assert isinstance(q_values_per_sub_critic, torch.Tensor)
                    per_critic_values.append(
                        (q_values_per_sub_critic -
                         target_q_values_per_step).pow(2).mean())
            q_losses[step_key] = per_critic_values

        # Transpose the list of lists to get into the right format and group values from different steps together
        #   (but keep them separate w.r.t. the q network)
        q_losses = [dict(zip(q_losses, t)) for t in zip(*q_losses.values())]
        q_values_mean = [
            dict(zip(q_values_mean, t)) for t in zip(*q_values_mean.values())
        ]
        return q_losses, q_values_mean
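
The target computation in the non-discrete branch is the standard one-step TD target, target = r + (1 - done) * gamma * next_q, with next_q already entropy-adjusted by predict_next_q_values. A minimal numeric sketch of that formula with made-up values:

import torch

last_rewards = torch.tensor([1.0, 0.0])
last_dones = torch.tensor([False, True])
next_q_values = torch.tensor([2.0, 5.0])
gamma = 0.99

# Terminal transitions keep only the reward; non-terminal ones bootstrap from the next Q value.
target_q_values = last_rewards + (~last_dones).float() * gamma * next_q_values   # tensor([2.98, 0.00])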
Example #17
    def actor_id(self) -> ActorID:
        """
        :return: The actor id (step_key=sub_step_index, agent_id not used).
        """
        return ActorID(step_key=self._sub_step_index, agent_id=0)
Example #18
    def actor_id(self) -> ActorID:
        """Single-step, two-agent environment"""
        return ActorID(step_key=0, agent_id=self.current_agent)