Example 1
def log_probs_from_logits_and_actions_and_spaces(
        policy_logits: List[TorchActionType],
        actions: List[TorchActionType],
        distribution_mapper: DistributionMapper) \
        -> Tuple[List[TorchActionType], List[DictProbabilityDistribution]]:
    """Computes action log-probs from policy logits, actions and acton_spaces.

    In the notation used throughout documentation and comments, T refers to the
    time dimension ranging from 0 to T-1. B refers to the batch size and
    NUM_ACTIONS refers to the number of actions.

    :param policy_logits: A list (w.r.t. the substeps of the env) of dicts (w.r.t. the actions) of tensors
        of un-normalized log-probabilities (shape list[dict[str,[T, B, NUM_ACTIONS]]])
    :param actions: A list (w.r.t. the substeps of the env) of dicts (w.r.t. the actions) of tensors
        (list[dict[str,[T, B]]])
    :param distribution_mapper: A distribution mapper providing a mapping of action heads to distributions.

    :return: A list (w.r.t. the substeps of the env) of dicts (w.r.t. the actions) of tensors of shape [T, B]
        corresponding to the sampling log probability of the chosen action w.r.t. the policy.
        And a list (w.r.t. the substeps of the env) of DictProbabilityDistribution objects corresponding to the
        step-action-distributions.
    """
    log_probs = list()
    step_action_dists = list()
    for step_policy_logits, step_actions in zip(policy_logits, actions):
        step_action_dist = distribution_mapper.logits_dict_to_distribution(
            logits_dict=step_policy_logits, temperature=1.0)
        log_probs.append(step_action_dist.log_prob(step_actions))
        step_action_dists.append(step_action_dist)
    return log_probs, step_action_dists
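
A minimal usage sketch (hedged: a single-substep env with one discrete action head; all names and shapes are illustrative assumptions, not part of the original):

import torch
from gym import spaces
from maze.distributions.distribution_mapper import DistributionMapper

# T=3 time steps, B=2 batch entries, 5 discrete actions, one substep
mapper = DistributionMapper(
    action_space=spaces.Dict(spaces={"action": spaces.Discrete(5)}),
    distribution_mapper_config={})
policy_logits = [{"action": torch.randn(3, 2, 5)}]
actions = [{"action": torch.randint(0, 5, (3, 2))}]

log_probs, dists = log_probs_from_logits_and_actions_and_spaces(
    policy_logits, actions, mapper)
# log_probs[0]["action"] has shape [T, B] == [3, 2]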
Example 2
def test_distribution_mapper():
    """ distribution test """

    # action space
    act_space = spaces.Dict(spaces={
        "selection": spaces.Discrete(10),
        "order": spaces.MultiBinary(15),
        "scale_input": spaces.Box(shape=(5,), low=0, high=100, dtype=np.float64),
        "order_by_weight": spaces.Box(shape=(5,), low=0, high=100, dtype=np.float64)
    })

    # default config
    config = [{
        "action_space": spaces.Box,
        "distribution": "maze.distributions.squashed_gaussian.SquashedGaussianProbabilityDistribution"
    }, {
        "action_head": "order_by_weight",
        "distribution": "maze.distributions.beta.BetaProbabilityDistribution"
    }]

    # initialize distribution mapper
    distribution_mapper = DistributionMapper(action_space=act_space,
                                             distribution_mapper_config=config)
    repr(distribution_mapper)

    # assign action heads to registered distributions
    logits_dict = dict()
    for action_head in act_space.spaces.keys():
        logits_shape = distribution_mapper.required_logits_shape(action_head)

        logits_tensor = torch.from_numpy(np.random.randn(*logits_shape))
        torch_dist = distribution_mapper.action_head_distribution(
            action_head=action_head, logits=logits_tensor, temperature=1.0)
        logits_dict[action_head] = logits_tensor

        # check if distributions are correctly assigned
        if action_head == "selection":
            assert isinstance(torch_dist, CategoricalProbabilityDistribution)
        elif action_head == "order":
            assert isinstance(torch_dist, BernoulliProbabilityDistribution)
        elif action_head == "scale_input":
            assert isinstance(torch_dist,
                              SquashedGaussianProbabilityDistribution)
        elif action_head == "order_by_weight":
            assert isinstance(torch_dist, BetaProbabilityDistribution)

    # test dictionary distribution mapping
    dict_dist = distribution_mapper.logits_dict_to_distribution(
        logits_dict=logits_dict, temperature=1.0)
    assert isinstance(dict_dist, DictProbabilityDistribution)
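
    # Hedged follow-up, continuing the names from the test above: the
    # resulting DictProbabilityDistribution can be sampled and scored
    # per action head.
    actions = dict_dist.sample()             # one action tensor per head
    log_probs = dict_dist.log_prob(actions)  # one log-prob tensor per head
    assert set(actions.keys()) == set(act_space.spaces.keys())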
Example 3
def test_dummy_model_with_dummy_network():
    """
    Unit test for the DummyStructuredEnvironment
    """
    maze_env = build_dummy_maze_env()

    # init the distribution_mapper with the flat action space
    distribution_mapper_config = [{
        "action_space": spaces.Box,
        "distribution": "maze.distributions.squashed_gaussian.SquashedGaussianProbabilityDistribution"
    }]
    distribution_mapper = DistributionMapper(
        action_space=maze_env.action_space,
        distribution_mapper_config=distribution_mapper_config)

    obs_shapes = observation_spaces_to_in_shapes(
        maze_env.observation_spaces_dict)

    dummy_actor = DummyPolicyNet(
        obs_shapes=obs_shapes[0],
        action_logits_shapes={
            key: distribution_mapper.required_logits_shape(key)
            for key in maze_env.action_space.spaces.keys()
        },
        non_lin=nn.Tanh)

    dummy_critic = DummyValueNet(obs_shapes=obs_shapes[0], non_lin=nn.Tanh)

    obs_np = maze_env.reset()
    obs = {k: torch.from_numpy(v) for k, v in obs_np.items()}

    for _ in range(100):
        logits_dict = dummy_actor(obs)
        prob_dist = distribution_mapper.logits_dict_to_distribution(
            logits_dict=logits_dict, temperature=1.0)
        sampled_actions = prob_dist.sample()

        obs_np, _, _, _ = maze_env.step(sampled_actions)
        obs = {k: torch.from_numpy(v) for k, v in obs_np.items()}

        _ = dummy_critic(obs)
    maze_env.close()
Example 4
def perform_test_maze_rllib_action_distribution(batch_dim: int):
    """ distribution test """
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    # action space
    act_space = spaces.Dict(spaces=dict(sorted({
        "selection": spaces.Discrete(10),
        "scale_input": spaces.Box(shape=(5,), low=0, high=100, dtype=np.float64),
        "order_by_weight": spaces.Box(shape=(5,), low=0, high=100, dtype=np.float64)
    }.items())))

    # default config
    config = [{
        "action_space": spaces.Box,
        "distribution": "maze.distributions.squashed_gaussian.SquashedGaussianProbabilityDistribution"
    }, {
        "action_head": "order_by_weight",
        "distribution": "maze.distributions.beta.BetaProbabilityDistribution"
    }]

    # initialize distribution mapper
    distribution_mapper = DistributionMapper(action_space=act_space,
                                             distribution_mapper_config=config)

    num_outputs = sum([
        np.prod(distribution_mapper.required_logits_shape(action_head))
        for action_head in distribution_mapper.action_space.spaces
    ])
    model_config = {
        'custom_model_config': {
            'maze_model_composer_config': {
                'distribution_mapper_config': config
            }
        }
    }
    assert num_outputs == MazeRLlibActionDistribution.required_model_output_shape(
        act_space, model_config)

    # assign action heads to registered distributions
    logits_dict = dict()
    for action_head in act_space.spaces.keys():

        logits_shape = distribution_mapper.required_logits_shape(action_head)
        if batch_dim > 0:
            logits_shape = (batch_dim, *logits_shape)

        logits_tensor = torch.from_numpy(np.random.randn(*logits_shape))
        logits_dict[action_head] = logits_tensor

    flat_input = torch.cat(list(logits_dict.values()), dim=-1)
    if batch_dim == 0:
        flat_input = flat_input.unsqueeze(0)
    fake_model = FakeRLLibModel(distribution_mapper)
    rllib_dist = MazeRLlibActionDistribution(flat_input,
                                             fake_model,
                                             temperature=0.5)

    # test dictionary distribution mapping
    maze_dist = distribution_mapper.logits_dict_to_distribution(
        logits_dict=logits_dict, temperature=0.5)

    for action_head in act_space.spaces.keys():
        maze_distribution = maze_dist.distribution_dict[action_head]
        maze_rllib_distribution = rllib_dist.maze_dist.distribution_dict[
            action_head]
        if hasattr(maze_distribution, 'logits'):
            assert torch.allclose(maze_distribution.logits,
                                  maze_rllib_distribution.logits)
        if hasattr(maze_distribution, 'low'):
            assert torch.allclose(maze_distribution.low,
                                  maze_rllib_distribution.low)
            assert torch.allclose(maze_distribution.high,
                                  maze_rllib_distribution.high)

    test_action_maze = maze_dist.sample()
    test_action_rllib = rllib_dist.sample()

    for action_head in act_space.spaces.keys():
        assert test_action_maze[action_head].shape == test_action_rllib[
            action_head].shape[int(batch_dim == 0):]

    maze_action = maze_dist.deterministic_sample()
    rllib_action = rllib_dist.deterministic_sample()

    for action_head in act_space.spaces.keys():
        assert torch.all(maze_action[action_head] == rllib_action[action_head])

    maze_action = convert_to_torch(maze_action,
                                   device=None,
                                   cast=torch.float64,
                                   in_place=True)
    rllib_action = convert_to_torch(rllib_action,
                                    device=None,
                                    cast=torch.float64,
                                    in_place=True)

    # This unsqueeze is performed by RLlib before passing an action to logp
    for action_head in act_space.spaces.keys():
        if len(rllib_action[action_head].shape) == 0:
            rllib_action[action_head] = rllib_action[action_head].unsqueeze(0)

    logp_maze_dict = maze_dist.log_prob(maze_action)
    action_concat = torch.cat(
        [v.unsqueeze(-1) for v in logp_maze_dict.values()], dim=-1)
    logp_maze = torch.sum(action_concat, dim=-1)

    logp_rllib = rllib_dist.logp(rllib_action)
    if batch_dim == 0:
        logp_rllib = logp_rllib[0]

    assert torch.equal(logp_maze, logp_rllib)

    logp_rllib_2 = rllib_dist.sampled_action_logp()
    if batch_dim == 0:
        logp_rllib_2 = logp_rllib_2[0]

    assert torch.equal(logp_maze, logp_rllib_2)

    maze_entropy = maze_dist.entropy()
    rllib_entropy = rllib_dist.entropy()
    if batch_dim == 0:
        rllib_entropy = rllib_entropy[0]

    assert torch.equal(maze_entropy, rllib_entropy)

    logits_dict2 = dict()
    for action_head in act_space.spaces.keys():
        logits_shape = distribution_mapper.required_logits_shape(action_head)
        if batch_dim > 0:
            logits_shape = (batch_dim, *logits_shape)

        logits_tensor = torch.from_numpy(np.random.randn(*logits_shape))
        logits_dict2[action_head] = logits_tensor

    flat_input = torch.cat(list(logits_dict2.values()), dim=-1)
    if batch_dim == 0:
        flat_input = flat_input.unsqueeze(0)
    fake_model = FakeRLLibModel(distribution_mapper)
    rllib_dist_2 = MazeRLlibActionDistribution(flat_input,
                                               fake_model,
                                               temperature=0.5)

    # test dictionary distribution mapping
    maze_dist_2 = distribution_mapper.logits_dict_to_distribution(
        logits_dict=logits_dict2, temperature=0.5)

    maze_kl = maze_dist.kl(maze_dist_2)
    rllib_kl = rllib_dist.kl(rllib_dist_2)
    if batch_dim == 0:
        rllib_kl = rllib_kl[0]

    assert torch.equal(maze_kl, rllib_kl)
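
Since the helper above is parametrized by batch_dim, a driver test presumably calls it for both the unbatched and batched cases; a minimal sketch (the test name and batch size are assumptions):

def test_maze_rllib_action_distribution():
    """ distribution test """
    perform_test_maze_rllib_action_distribution(batch_dim=0)    # no batch dimension
    perform_test_maze_rllib_action_distribution(batch_dim=100)  # batched logits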
Example 5
from typing import Dict, Sequence

import torch
from gym import spaces
from torch import nn

from maze.distributions.distribution_mapper import DistributionMapper

# Dictionary keys used below. ACTION_NAME matches the sampled action shown at
# the end of the example; the observation key is an assumed placeholder, since
# the original snippet starts mid-class.
OBSERVATION_NAME = "my_observation"
ACTION_NAME = "my_action"


class PolicyNet(nn.Module):
    """Simple policy net mapping a flat observation to a dict of action logits."""

    def __init__(self, obs_shapes: Dict[str, Sequence[int]],
                 action_logits_shapes: Dict[str, Sequence[int]]):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=obs_shapes[OBSERVATION_NAME][0],
                      out_features=16), nn.Tanh(),
            nn.Linear(in_features=16,
                      out_features=action_logits_shapes[ACTION_NAME][0]))

    def forward(self, in_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Forward pass mapping the observation dict to an action-logits dict."""
        return {ACTION_NAME: self.net(in_dict[OBSERVATION_NAME])}


# init default distribution mapper
distribution_mapper = DistributionMapper(
    action_space=spaces.Dict(spaces={ACTION_NAME: spaces.Discrete(2)}),
    distribution_mapper_config={})

# request required action logits shape and init a policy net
logits_shape = distribution_mapper.required_logits_shape(ACTION_NAME)
policy_net = PolicyNet(obs_shapes={OBSERVATION_NAME: (4, )},
                       action_logits_shapes={ACTION_NAME: logits_shape})

# compute action logits (here from random input)
logits_dict = policy_net({OBSERVATION_NAME: torch.randn(4)})

# init action sampling distribution from model output
dist = distribution_mapper.logits_dict_to_distribution(logits_dict,
                                                       temperature=1.0)

# sample action (e.g., {my_action: 1})
action = dist.sample()
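
As a hedged continuation of the snippet, the same distribution can also score the sampled action, e.g. for a policy-gradient loss:

# log_prob returns a dict with one log-probability tensor per action head
log_prob = dist.log_prob(action)[ACTION_NAME]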