def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None)
    obs_shapes = [(obs_size, )]
    act_size = [2]
    stream_names = [f"stream_name{n}" for n in range(4)]
    action_spec = ActionSpec.create_continuous(act_size[0])
    actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones(
            (1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size))
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], [],
                                                memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1, )

    # Test get_dist_and_value
    dists, value_out, mem_out = actor.get_dist_and_value([sample_obs], [],
                                                         memories=memories)
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for dist in dists:
        assert isinstance(dist, GaussianDistInstance)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
        else:
            assert value_out[stream].shape == (1, )
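The LSTM branch above gets its shapes from the MemorySettings defaults. A minimal sketch of those defaults, assuming a recent mlagents.trainers release:

from mlagents.trainers.settings import NetworkSettings

# Sketch only: the default MemorySettings drive the sequence dimension of
# sample_obs and the width of the memories tensor in the test above.
settings = NetworkSettings(memory=NetworkSettings.MemorySettings())
assert settings.memory.sequence_length == 64   # default sequence length
print(settings.memory.memory_size)             # default memory size (128 in recent releases)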
Example #2
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is nothing in the policy queue
        with pytest.raises(AgentManagerQueue.Empty):
            policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
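The policy-queue checks above rely on AgentManagerQueue being non-blocking. A minimal sketch of that behavior, assuming mlagents.trainers.agent_processor is importable:

from mlagents.trainers.agent_processor import AgentManagerQueue

# get_nowait() raises AgentManagerQueue.Empty instead of blocking when the
# trainer has published nothing, which is what the test asserts above.
queue = AgentManagerQueue("testbrain")
queue.put("policy_update")
assert queue.get_nowait() == "policy_update"
try:
    queue.get_nowait()
except AgentManagerQueue.Empty:
    pass  # drained, as expected when the policy did not update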
Example #3
def test_multinetworkbody_visual(with_actions):
    torch.manual_seed(0)
    act_size = 2
    n_agents = 3
    obs_size = 4
    vis_obs_size = (84, 84, 3)
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size, ), vis_obs_size]
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings,
        action_spec)
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = [[0.1 * torch.ones(
        (1, obs_size))] + [0.1 * torch.ones((1, 84, 84, 3))]
                  for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(0.1 * torch.ones((1, 2)),
                    [0.1 * torch.ones(1) for _ in range(act_size)])
        for _ in range(n_agents - 1)
    ]
    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(obs_only=sample_obs[:1],
                                     obs=sample_obs[1:],
                                     actions=sample_act)
        else:
            encoded, _ = networkbody(obs_only=sample_obs, obs=[], actions=[])

        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
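The network above is built from a hybrid action spec: act_size continuous actions plus act_size discrete branches of act_size choices each. A quick sketch of what such a spec reports, using only facts shown in the ActionSpec examples below:

from mlagents_envs.base_env import ActionSpec

# Hybrid spec matching the construction above for act_size = 2.
spec = ActionSpec(2, (2, 2))
assert spec.continuous_size == 2
assert spec.discrete_size == 2          # number of discrete branches
assert spec.discrete_branches == (2, 2)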
Example #4
def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10,
                        summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes([(1, )]),
                                 ActionSpec.create_discrete((2, )))
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0
Example #5
def create_mock_steps(
    num_agents: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :List observation_shapes: A List of the observation shapes in your steps.
    :ActionSpec action_spec: The ActionSpec describing the action space.
    :bool done: Whether all the agents in the batch are done.
    """
    obs_list = []
    for _shape in observation_shapes:
        obs_list.append(np.ones((num_agents, ) + _shape, dtype=np.float32))
    action_mask = None
    if action_spec.is_discrete():
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_spec.discrete_branches  # type: ignore
        ]  # type: ignore

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(observation_shapes, action_spec)
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    else:
        return (
            DecisionSteps(obs_list, reward, agent_id, action_mask),
            TerminalSteps.empty(behavior_spec),
        )
Example #6
def test_specs():
    specs = ActionSpec.create_continuous(3)
    assert specs.discrete_branches == ()
    assert specs.discrete_size == 0
    assert specs.continuous_size == 3
    assert specs.empty_action(5).continuous.shape == (5, 3)
    assert specs.empty_action(5).continuous.dtype == np.float32

    specs = ActionSpec.create_discrete((3, ))
    assert specs.discrete_branches == (3, )
    assert specs.discrete_size == 1
    assert specs.continuous_size == 0
    assert specs.empty_action(5).discrete.shape == (5, 1)
    assert specs.empty_action(5).discrete.dtype == np.int32

    specs = ActionSpec(3, (3, ))
    assert specs.continuous_size == 3
    assert specs.discrete_branches == (3, )
    assert specs.discrete_size == 1
    assert specs.empty_action(5).continuous.shape == (5, 3)
    assert specs.empty_action(5).continuous.dtype == np.float32
    assert specs.empty_action(5).discrete.shape == (5, 1)
    assert specs.empty_action(5).discrete.dtype == np.int32
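Recent mlagents_envs releases also expose ActionSpec.random_action alongside empty_action; a hedged sketch, assuming that method is available, with the same batch-major shapes:

from mlagents_envs.base_env import ActionSpec

# Sketch only: shapes mirror empty_action, but the values are sampled.
specs = ActionSpec(3, (3, ))
sampled = specs.random_action(5)
assert sampled.continuous.shape == (5, 3)
assert sampled.discrete.shape == (5, 1)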
Example #7
from mlagents_envs.base_env import ActionSpec


@pytest.fixture
def dummy_config():
    return ppo_dummy_config()


VECTOR_ACTION_SPACE = 2
VECTOR_OBS_SPACE = 8
DISCRETE_ACTION_SPACE = [3, 3, 3, 2]
BUFFER_INIT_SAMPLES = 64
NUM_AGENTS = 12

CONTINUOUS_ACTION_SPEC = ActionSpec.create_continuous(VECTOR_ACTION_SPACE)
DISCRETE_ACTION_SPEC = ActionSpec.create_discrete(tuple(DISCRETE_ACTION_SPACE))


def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
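The fixture is cloned with attr.evolve before the memory settings are overridden. A minimal sketch of that copy-then-override pattern, assuming attrs and mlagents.trainers.settings are available:

import attr
from mlagents.trainers.settings import NetworkSettings, TrainerSettings

# Clone a default settings object, then switch on recurrence for the RNN case.
settings = attr.evolve(TrainerSettings())
settings.network_settings.memory = NetworkSettings.MemorySettings(
    sequence_length=16, memory_size=10)
assert settings.network_settings.memory.sequence_length == 16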
Example #8
def test_agent_deletion():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }

    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    mock_done_decision_step, mock_done_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
        done=True,
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )

    # Run 3 trajectories, with different workers (to simulate different agents)
    add_calls = []
    remove_calls = []
    for _ep in range(3):
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            add_calls.append(
                mock.call([get_global_agent_id(_ep, 0)], fake_action_outputs["action"])
            )
        processor.add_experiences(
            mock_done_decision_step, mock_done_terminal_step, _ep, fake_action_info
        )
        # Make sure we don't add experiences from the prior agents after the done
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))

    policy.save_previous_action.assert_has_calls(add_calls)
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0

    # check that steps with immediate dones don't add to dicts
    processor.add_experiences(
        mock_done_decision_step, mock_done_terminal_step, 0, ActionInfo.empty()
    )
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
    assert len(processor.last_step_result.keys()) == 0
Example #9
    def __init__(
            self,
            brain_names,
            step_size=STEP_SIZE,
            num_visual=0,
            num_vector=1,
            num_var_len=0,
            vis_obs_size=VIS_OBS_SIZE,
            vec_obs_size=OBS_SIZE,
            var_len_obs_size=VAR_LEN_SIZE,
            action_sizes=(1, 0),
    ):
        super().__init__()
        self.num_visual = num_visual
        self.num_vector = num_vector
        self.num_var_len = num_var_len
        self.vis_obs_size = vis_obs_size
        self.vec_obs_size = vec_obs_size
        self.var_len_obs_size = var_len_obs_size
        continuous_action_size, discrete_action_size = action_sizes
        discrete_tuple = tuple(2 for _ in range(discrete_action_size))
        action_spec = ActionSpec(continuous_action_size, discrete_tuple)
        self.total_action_size = (continuous_action_size + discrete_action_size
                                  )  # to set the goals/positions
        self.action_spec = action_spec
        self.behavior_spec = BehaviorSpec(self._make_observation_specs(),
                                          action_spec)
        self.names = brain_names
        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}

        # Concatenate the arguments for a consistent random seed
        seed = (
            brain_names,
            step_size,
            num_visual,
            num_vector,
            num_var_len,
            vis_obs_size,
            vec_obs_size,
            var_len_obs_size,
            action_sizes,
        )
        self.random = random.Random(str(seed))

        self.goal: Dict[str, int] = {}
        self.action = {}
        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}
        self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
        self.agent_id: Dict[str, int] = {}
        self.step_size = step_size  # defines the difficulty of the test
        # Allow this to be used as a UnityEnvironment during tests
        self.academy_capabilities = None

        for name in self.names:
            self.agent_id[name] = 0
            self.goal[name] = self.random.choice([-1, 1])
            self.rewards[name] = 0
            self.final_rewards[name] = []
            self._reset_agent(name)
            self.action[name] = None
            self.step_result[name] = None
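The constructor above seeds its RNG by stringifying the whole argument tuple, so two environments built with identical arguments step identically. A stdlib-only sketch of that trick (the argument values here are illustrative):

import random

# Stringifying the argument tuple gives a deterministic, hashable seed.
seed_args = (("brain_a", ), 0.2, 0, 1, 0, (20, 20, 3), 1, (10, 3), (1, 0))
rng_a = random.Random(str(seed_args))
rng_b = random.Random(str(seed_args))
assert [rng_a.choice([-1, 1]) for _ in range(5)] == [rng_b.choice([-1, 1]) for _ in range(5)]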
Example #10
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8, )] +
                                                               num_vis_obs *
                                                               [(84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(mock_decision_steps, mock_terminal_steps, 0,
                              ActionInfo.empty())
    for _ in range(5):
        processor.add_experiences(mock_decision_steps, mock_terminal_steps, 0,
                                  fake_action_info)

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_specs=create_observation_specs_with_shapes([(8, )] +
                                                               num_vis_obs *
                                                               [(84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(mock_decision_steps, mock_terminal_steps, 0,
                              ActionInfo.empty())
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
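The trajectory assertions above index into unittest.mock's call_args_list, and the [0][0][0] chain is easy to misread. A small sketch of that indexing:

from unittest import mock

# call_args_list[i][0] is the positional-args tuple of the i-th call, so
# [0][0][0] is the first positional argument of the first put() call.
q = mock.Mock()
q.put("first_trajectory")
q.put("second_trajectory")
assert len(q.put.call_args_list) == 2
assert q.put.call_args_list[0][0][0] == "first_trajectory"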
Example #11
    create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import GAILSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer, )
from mlagents.trainers.torch.components.reward_providers.gail_reward_provider import (
    DiscriminatorNetwork, )

CONTINUOUS_PATH = (os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                os.pardir, os.pardir) + "/test.demo")
DISCRETE_PATH = (os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              os.pardir, os.pardir) + "/testdcvis.demo")
SEED = [42]

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(2)
ACTIONSPEC_FOURDISCRETE = ActionSpec.create_discrete((2, 3, 3, 3))
ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20, ))


@pytest.mark.parametrize("behavior_spec",
                         [BehaviorSpec([(8, )], ACTIONSPEC_CONTINUOUS)])
def test_construction(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
    assert gail_rp.name == "GAIL"


@pytest.mark.parametrize("behavior_spec",
                         [BehaviorSpec([(8, )], ACTIONSPEC_CONTINUOUS)])
def test_factory(behavior_spec: BehaviorSpec) -> None:
Example #12
def make_fake_trajectory(
    length: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    action_probs = np.ones(
        int(
            np.sum(action_spec.discrete_branches) +
            action_spec.continuous_size),
        dtype=np.float32,
    )
    for _i in range(length - 1):
        obs = []
        for _shape in observation_shapes:
            obs.append(np.ones(_shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = ([[False for _ in range(branch)]
                        for branch in action_spec.discrete_branches
                        ]  # type: ignore
                       if action_spec.is_discrete() else None)
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    obs = []
    for _shape in observation_shapes:
        obs.append(np.ones(_shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(steps=steps_list,
                      agent_id=agent_id,
                      behavior_id=behavior_id,
                      next_obs=obs)
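A hypothetical call to the helper above (relying on the same module-level imports it uses): terminating on a max step flips interrupted on the last experience instead of done:

# Hypothetical usage sketch of make_fake_trajectory as defined above.
traj = make_fake_trajectory(
    length=10,
    observation_shapes=[(8, ), (84, 84, 3)],
    action_spec=ActionSpec.create_continuous(2),
    max_step_complete=True,
)
assert len(traj.steps) == 10
assert traj.steps[-1].interrupted   # ended by a max step...
assert not traj.steps[-1].done      # ...not by a terminal done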
Example #13
    torch.onnx.export(network,
                      dummy_input,
                      EXPORT_FILE,
                      opset_version=9,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes=dynamic_axes)


if __name__ == '__main__':
    obs_spec = [
        ObservationSpec(shape=(16, ),
                        dimension_property=(DimensionProperty.UNSPECIFIED, ),
                        observation_type=ObservationType.DEFAULT)
    ]
    act_spec = ActionSpec(continuous_size=4, discrete_branches=())
    net_settings = NetworkSettings(normalize=False,
                                   hidden_units=256,
                                   num_layers=2,
                                   vis_encode_type=EncoderType.SIMPLE,
                                   memory=NetworkSettings.MemorySettings(
                                       sequence_length=64, memory_size=256))

    network = SerializableSimpleActor(obs_spec, net_settings, act_spec)
    state_dict = torch.load(MODEL_FILE, map_location=torch.device('cpu'))
    filtered_sd = {
        i: j
        for i, j in state_dict['Policy'].items() if 'critic' not in i
    }
    network.load_state_dict(filtered_sd)
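The state_dict filtering above keeps only the policy weights before export. A toy sketch of the same comprehension using hypothetical checkpoint keys:

import torch

# Hypothetical keys: drop every entry whose key mentions the critic so only
# actor/policy weights are loaded into the exportable network.
state_dict = {
    "Policy": {
        "network_body.actor.weight": torch.zeros(1),
        "critic.value_heads.weight": torch.zeros(1),
    }
}
filtered_sd = {k: v for k, v in state_dict["Policy"].items() if "critic" not in k}
assert list(filtered_sd) == ["network_body.actor.weight"]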
Example #14
from mlagents.trainers.buffer import BufferKey
import pytest
import numpy as np
from mlagents.trainers.torch.components.reward_providers import (
    ExtrinsicRewardProvider,
    create_reward_provider,
)
from mlagents_envs.base_env import BehaviorSpec, ActionSpec
from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType
from mlagents.trainers.tests.torch.test_reward_providers.utils import (
    create_agent_buffer, )
from mlagents.trainers.tests.dummy_config import create_observation_specs_with_shapes

ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5)
ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3))


@pytest.mark.parametrize(
    "behavior_spec",
    [
        BehaviorSpec(create_observation_specs_with_shapes([(10, )]),
                     ACTIONSPEC_CONTINUOUS),
        BehaviorSpec(create_observation_specs_with_shapes([(10, )]),
                     ACTIONSPEC_TWODISCRETE),
    ],
)
def test_construction(behavior_spec: BehaviorSpec) -> None:
    settings = RewardSignalSettings()
    settings.gamma = 0.2
    extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings)
    assert extrinsic_rp.gamma == 0.2
Example #15
def create_action_model(inp_size, act_size, deterministic=False):
    mask = torch.ones([1, act_size ** 2])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    action_model = ActionModel(inp_size, action_spec, deterministic=deterministic)
    return action_model, mask
Example #16
def create_action_model(inp_size, act_size):
    mask = torch.ones([1, act_size * 2])
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    action_model = ActionModel(inp_size, action_spec)
    return action_model, mask
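The two variants above size the mask differently (act_size ** 2 versus act_size * 2). For act_size = 2, an illustrative value, both expressions equal the total number of discrete choices in ActionSpec(2, (2, 2)), as this arithmetic check shows:

# Both mask widths coincide with the sum of the discrete branch sizes.
act_size = 2
branch_sizes = tuple(act_size for _ in range(act_size))   # (2, 2)
assert act_size ** 2 == act_size * 2 == sum(branch_sizes) == 4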
Example #17
def basic_behavior_spec():
    dummy_actionspec = ActionSpec.create_continuous(1)
    dummy_groupspec = BehaviorSpec([(1, )], dummy_actionspec)
    return dummy_groupspec
Example #18
def test_group_statuses():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=4,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        grouped=True,
    )
    fake_action_info = _create_action_info(4, mock_decision_steps.agent_id)
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(2):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Make terminal steps for some dead agents
    _, mock_terminal_steps_2 = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=True,
        grouped=True,
        agent_ids=[2, 3],
    )
    # Make decision steps continue for other agents
    mock_decision_steps_2, _ = mb.create_mock_steps(
        num_agents=2,
        observation_specs=create_observation_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
        done=False,
        grouped=True,
        agent_ids=[0, 1],
    )

    processor.add_experiences(
        mock_decision_steps_2, mock_terminal_steps_2, 0, fake_action_info
    )
    # Continue to add for remaining live agents
    fake_action_info = _create_action_info(4, mock_decision_steps_2.agent_id)
    for _ in range(3):
        processor.add_experiences(
            mock_decision_steps_2, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that four trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 4

    # Get the first trajectory, which should have been agent 2 (one of the killed agents)
    trajectory = tqueue.put.call_args_list[0][0][-1]
    assert len(trajectory.steps) == 3
    # Make sure trajectory has the right Groupmate Experiences.
    # All three steps should contain all agents
    for step in trajectory.steps:
        assert len(step.group_status) == 3

    # Last trajectory should be the longest. It should be that of agent 1, one of the surviving agents.
    trajectory = tqueue.put.call_args_list[-1][0][-1]
    assert len(trajectory.steps) == 5

    # Make sure trajectory has the right Groupmate Experiences.
    # The first 3 steps should contain all of the obs (the 3rd step is also the terminal step of 2 of the agents)
    for step in trajectory.steps[0:3]:
        assert len(step.group_status) == 3
    # After 2 agents have died, there should only be 1 group status.
    for step in trajectory.steps[3:]:
        assert len(step.group_status) == 1
Example #19
def create_behavior_spec(num_visual, num_vector, vector_size):
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * int(num_visual) + [(vector_size,)] * int(num_vector),
        ActionSpec.create_discrete((1,)),
    )
    return behavior_spec
Example #20
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)

        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
Example #21
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True
    )
    obs_shapes = [(obs_size,)]
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # action_spec = ActionSpec.create_continuous(act_size[0])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    actor = ac_type(obs_shapes, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size)
        )
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (network_settings.memory.sequence_length,)
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Test get_action_stats_and_value
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], [], memories=memories, masks=mask
    )
    if lstm:
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)

    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)

    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (network_settings.memory.sequence_length,)
        else:
            assert value_out[stream].shape == (1,)

    # Test normalization
    actor.update_normalization(sample_obs)
    if isinstance(actor, SeparateActorCritic):
        for act_proc, crit_proc in zip(
            actor.network_body.vector_processors,
            actor.critic.network_body.vector_processors,
        ):
            assert compare_models(act_proc, crit_proc)