Example #1
    def __init__(self, brain, trainer_parameters, training, load, seed,
                 run_id):
        """
        Responsible for collecting experiences and training the behavioral cloning (BC) model.
        :param brain: The brain associated with this trainer.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with.
        :param run_id: The identifier of the current run.
        """
        super(BCTrainer, self).__init__(brain, trainer_parameters, training,
                                        run_id)
        self.policy = BCPolicy(seed, brain, trainer_parameters, load)
        self.n_sequences = 1
        self.cumulative_rewards = {}
        self.episode_steps = {}
        self.stats = {
            "Losses/Cloning Loss": [],
            "Environment/Episode Length": [],
            "Environment/Cumulative Reward": [],
        }

        self.batches_per_epoch = trainer_parameters["batches_per_epoch"]

        self.demonstration_buffer = AgentBuffer()
        self.evaluation_buffer = ProcessingBuffer()
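
For context, a minimal sketch of how this constructor might be invoked; here `brain` and `trainer_parameters` are assumed to be supplied by the surrounding training setup and are not defined in this snippet.

# Hypothetical instantiation; brain and trainer_parameters come from the caller.
trainer = BCTrainer(brain, trainer_parameters, training=True, load=False,
                    seed=0, run_id="run-0")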
Example #2
def test_buffer():
    agent_1_buffer = construct_fake_buffer(1)
    agent_2_buffer = construct_fake_buffer(2)
    agent_3_buffer = construct_fake_buffer(3)
    a = agent_1_buffer[ObsUtil.get_name_at(0)].get_batch(batch_size=2,
                                                         training_length=1,
                                                         sequential=True)
    assert_array(np.array(a), np.array([[171, 172, 173], [181, 182, 183]]))
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(batch_size=2,
                                                         training_length=3,
                                                         sequential=True)
    assert_array(
        np.array(a),
        np.array([
            [231, 232, 233],
            [241, 242, 243],
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]),
    )
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(batch_size=2,
                                                         training_length=3,
                                                         sequential=False)
    assert_array(
        np.array(a),
        np.array([
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]),
    )
    agent_1_buffer.reset_agent()
    assert agent_1_buffer.num_experiences == 0
    update_buffer = AgentBuffer()
    agent_2_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    agent_3_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20

    assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)

    c = update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == update_buffer.keys()
    assert np.array(c[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)
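
The expected arrays follow directly from how construct_fake_buffer fills each observation (see Example #8): every entry is 100 * agent_id + 10 * step + column. A quick arithmetic check, included here purely as an illustration:

# Agent 1, steps 7 and 8 produce the two rows asserted in the first batch above.
agent_id = 1
expected = [[100 * agent_id + 10 * step + col for col in (1, 2, 3)]
            for step in (7, 8)]
assert expected == [[171, 172, 173], [181, 182, 183]]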
Example #3
def test_buffer():
    b = construct_fake_processing_buffer()
    a = b[1]["vector_observation"].get_batch(batch_size=2,
                                             training_length=1,
                                             sequential=True)
    assert_array(np.array(a), np.array([[171, 172, 173], [181, 182, 183]]))
    a = b[2]["vector_observation"].get_batch(batch_size=2,
                                             training_length=3,
                                             sequential=True)
    assert_array(
        np.array(a),
        np.array([
            [231, 232, 233],
            [241, 242, 243],
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]),
    )
    a = b[2]["vector_observation"].get_batch(batch_size=2,
                                             training_length=3,
                                             sequential=False)
    assert_array(
        np.array(a),
        np.array([
            [251, 252, 253],
            [261, 262, 263],
            [271, 272, 273],
            [261, 262, 263],
            [271, 272, 273],
            [281, 282, 283],
        ]),
    )
    b[4].reset_agent()
    assert len(b[4]) == 0
    update_buffer = AgentBuffer()
    b.append_to_update_buffer(update_buffer,
                              3,
                              batch_size=None,
                              training_length=2)
    b.append_to_update_buffer(update_buffer,
                              2,
                              batch_size=None,
                              training_length=2)
    assert len(update_buffer["action"]) == 20

    assert np.array(update_buffer["action"]).shape == (20, 2)

    c = update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == update_buffer.keys()
    assert np.array(c["action"]).shape == (1, 2)
Example #4
    def _append_to_update_buffer(self,
                                 agentbuffer_trajectory: AgentBuffer) -> None:
        """
        Append an AgentBuffer to the update buffer. If the trainer isn't training,
        don't update to avoid a memory leak.
        """
        if self.should_still_train:
            seq_len = (
                self.trainer_settings.network_settings.memory.sequence_length
                if self.trainer_settings.network_settings.memory is not None
                else 1)
            agentbuffer_trajectory.resequence_and_append(
                self.update_buffer, training_length=seq_len)
Example #5
def test_buffer_save_load():
    original = construct_fake_buffer(3)
    import io

    write_buffer = io.BytesIO()
    original.save_to_file(write_buffer)

    loaded = AgentBuffer()
    loaded.load_from_file(write_buffer)

    assert len(original) == len(loaded)
    for k in original.keys():
        assert np.allclose(original[k], loaded[k])
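
The same round trip should also work against a file on disk, since save_to_file and load_from_file only require a binary file-like object. A minimal sketch (the file name is hypothetical):

original = construct_fake_buffer(3)
with open("agent_buffer.pkl", "wb") as f:
    original.save_to_file(f)
restored = AgentBuffer()
with open("agent_buffer.pkl", "rb") as f:
    restored.load_from_file(f)
assert len(original) == len(restored)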
Example #6
    def __init__(self, *args, **kwargs):
        super(RLTrainer, self).__init__(*args, **kwargs)
        # Make sure we have at least one reward_signal
        if not self.trainer_parameters["reward_signals"]:
            raise UnityTrainerException(
                "No reward signals were defined. At least one must be used with {}."
                .format(self.__class__.__name__))
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.collected_rewards = {"environment": {}}
        self.processing_buffer = ProcessingBuffer()
        self.update_buffer = AgentBuffer()
        self.episode_steps = {}
Example #7
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )

        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
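
The pattern above, accumulate experiences in a raw buffer, flush into a processed buffer at each episode boundary, then reset, is the core of this loader. A stripped-down sketch of the same idea with made-up step data:

raw_buffer = AgentBuffer()
processed_buffer = AgentBuffer()
for done, reward in [(False, 0.0), (False, 0.5), (True, 1.0)]:
    raw_buffer["done"].append(done)
    raw_buffer["rewards"].append(reward)
    if done:
        raw_buffer.resequence_and_append(
            processed_buffer, batch_size=None, training_length=1)
        raw_buffer.reset_agent()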
Example #8
def construct_fake_buffer(fake_agent_id):
    b = AgentBuffer()
    for step in range(9):
        b[ObsUtil.get_name_at(0)].append(
            np.array(
                [
                    100 * fake_agent_id + 10 * step + 1,
                    100 * fake_agent_id + 10 * step + 2,
                    100 * fake_agent_id + 10 * step + 3,
                ],
                dtype=np.float32,
            ))
        b[BufferKey.CONTINUOUS_ACTION].append(
            np.array(
                [
                    100 * fake_agent_id + 10 * step + 4,
                    100 * fake_agent_id + 10 * step + 5,
                ],
                dtype=np.float32,
            ))
        b[BufferKey.GROUP_CONTINUOUS_ACTION].append([
            np.array(
                [
                    100 * fake_agent_id + 10 * step + 4,
                    100 * fake_agent_id + 10 * step + 5,
                ],
                dtype=np.float32,
            )
        ] * 3)
    return b
Example #9
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action["continuous_action"] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action["discrete_action"] = action_buffer.discrete

    for _ in range(number):
        curr_split_obs = SplitObservations.from_observations(curr_observations)
        next_split_obs = SplitObservations.from_observations(next_observations)
        for i, _ in enumerate(curr_split_obs.visual_observations):
            buffer["visual_obs%d" % i].append(
                curr_split_obs.visual_observations[i])
            buffer["next_visual_obs%d" % i].append(
                next_split_obs.visual_observations[i])
        buffer["vector_obs"].append(curr_split_obs.vector_observations)
        buffer["next_vector_in"].append(next_split_obs.vector_observations)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
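
A hypothetical call, assuming a BehaviorSpec instance named spec is already available from the test setup:

buf = create_agent_buffer(spec, number=5, reward=1.0)
assert len(buf["reward"]) == 5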
Example #10
def test_sac_rnn_policy(dummy_config):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_sac_policy_mock(dummy_config,
                                    use_rnn=True,
                                    use_discrete=True,
                                    use_visual=False)
    step = mb.create_batchedstep_from_brainparams(policy.brain,
                                                  num_agents=NUM_AGENTS)
    run_out = policy.evaluate(step, list(step.agent_id))
    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))

    # Test update
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                 policy.brain,
                                 memory_size=8)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    update_buffer = AgentBuffer()
    buffer.resequence_and_append(update_buffer,
                                 training_length=policy.sequence_length)
    run_out = policy.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // policy.sequence_length,
    )
Example #11
def test_obsutil_group_from_buffer():
    buff = AgentBuffer()
    # Create some obs
    for _ in range(3):
        buff[GroupObsUtil.get_name_at(0)].append(
            3 * [np.ones((5, ), dtype=np.float32)])
    # Some agents have died
    for _ in range(2):
        buff[GroupObsUtil.get_name_at(0)].append(
            1 * [np.ones((5, ), dtype=np.float32)])

    # Get the group obs, which will be a List of Lists of np.ndarray, where each element is the same
    # length as the AgentBuffer but contains only one agent's obs. Dead agents are padded by
    # NaNs.
    gobs = GroupObsUtil.from_buffer(buff, 1)
    # Agent 0 is full
    agent_0_obs = gobs[0]
    for obs in agent_0_obs:
        assert obs.shape == (buff.num_experiences, 5)
        assert not np.isnan(obs).any()

    agent_1_obs = gobs[1]
    for obs in agent_1_obs:
        assert obs.shape == (buff.num_experiences, 5)
        for i, _exp_obs in enumerate(obs):
            if i >= 3:
                assert np.isnan(_exp_obs).all()
            else:
                assert not np.isnan(_exp_obs).any()
Example #12
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_obs = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    next_obs = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action["continuous_action"] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action["discrete_action"] = action_buffer.discrete

    for _ in range(number):
        for i, obs in enumerate(curr_obs):
            buffer[ObsUtil.get_name_at(i)].append(obs)
        for i, obs in enumerate(next_obs):
            buffer[ObsUtil.get_name_at_next(i)].append(obs)
        buffer["actions"].append(action)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
Example #13
    def to_agentbuffer(self) -> AgentBuffer:
        """
        Converts a Trajectory to an AgentBuffer.
        :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
        less than the trajectory, as the next observation needs to be populated from the last
        step of the trajectory.
        """
        agent_buffer_trajectory = AgentBuffer()
        vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
        for step, exp in enumerate(self.steps):
            if step < len(self.steps) - 1:
                next_vec_vis_obs = SplitObservations.from_observations(
                    self.steps[step + 1].obs)
            else:
                next_vec_vis_obs = SplitObservations.from_observations(
                    self.next_obs)

            for i, _ in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["visual_obs%d" % i].append(
                    vec_vis_obs.visual_observations[i])
                agent_buffer_trajectory["next_visual_obs%d" % i].append(
                    next_vec_vis_obs.visual_observations[i])
            agent_buffer_trajectory["vector_obs"].append(
                vec_vis_obs.vector_observations)
            agent_buffer_trajectory["next_vector_in"].append(
                next_vec_vis_obs.vector_observations)
            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)

            agent_buffer_trajectory["masks"].append(1.0)
            agent_buffer_trajectory["done"].append(exp.done)
            # Add the outputs of the last eval
            if exp.action_pre is not None:
                actions_pre = exp.action_pre
                agent_buffer_trajectory["actions_pre"].append(actions_pre)

            # value is a dictionary from name of reward to value estimate of the value head
            agent_buffer_trajectory["actions"].append(exp.action)
            agent_buffer_trajectory["action_probs"].append(exp.action_probs)

            # Store action masks if necessary. Note that 1 means active, while
            # in AgentExperience False means active.
            if exp.action_mask is not None:
                mask = 1 - np.concatenate(exp.action_mask)
                agent_buffer_trajectory["action_mask"].append(mask,
                                                              padding_value=1)
            else:
                # This should never be needed unless the environment somehow doesn't supply the
                # action mask in a discrete space.
                agent_buffer_trajectory["action_mask"].append(np.ones(
                    exp.action_probs.shape, dtype=np.float32),
                                                              padding_value=1)

            agent_buffer_trajectory["prev_action"].append(exp.prev_action)
            agent_buffer_trajectory["environment_rewards"].append(exp.reward)

            # Store the next visual obs as the current
            vec_vis_obs = next_vec_vis_obs
        return agent_buffer_trajectory
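
A sketch of how the returned buffer is typically consumed, mirroring Example #4: the per-trajectory buffer is folded into a trainer-level update buffer. `trajectory`, `update_buffer`, and `seq_len` are assumed to exist in the caller.

agent_buffer_trajectory = trajectory.to_agentbuffer()
agent_buffer_trajectory.resequence_and_append(
    update_buffer, training_length=seq_len)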
Example #14
def test_agent_action_group_from_buffer():
    buff = AgentBuffer()
    # Create some actions
    for _ in range(3):
        buff[BufferKey.GROUP_CONTINUOUS_ACTION].append(
            3 * [np.ones((5,), dtype=np.float32)]
        )
        buff[BufferKey.GROUP_DISCRETE_ACTION].append(
            3 * [np.ones((4,), dtype=np.float32)]
        )
    # Some agents have died
    for _ in range(2):
        buff[BufferKey.GROUP_CONTINUOUS_ACTION].append(
            1 * [np.ones((5,), dtype=np.float32)]
        )
        buff[BufferKey.GROUP_DISCRETE_ACTION].append(
            1 * [np.ones((4,), dtype=np.float32)]
        )

    # Get the group actions, which will be a List of Lists of AgentAction, where each element is the same
    # length as the AgentBuffer but contains only one agent's obs. Dead agents are padded by
    # NaNs.
    gact = AgentAction.group_from_buffer(buff)
    # Agent 0 is full
    agent_0_act = gact[0]
    assert agent_0_act.continuous_tensor.shape == (buff.num_experiences, 5)
    assert agent_0_act.discrete_tensor.shape == (buff.num_experiences, 4)

    agent_1_act = gact[1]
    assert agent_1_act.continuous_tensor.shape == (buff.num_experiences, 5)
    assert agent_1_act.discrete_tensor.shape == (buff.num_experiences, 4)
    assert (agent_1_act.continuous_tensor[0:3] > 0).all()
    assert (agent_1_act.continuous_tensor[3:] == 0).all()
    assert (agent_1_act.discrete_tensor[0:3] > 0).all()
    assert (agent_1_act.discrete_tensor[3:] == 0).all()
Example #15
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.cumulative_returns_since_policy_update: List[float] = []
        self.collected_rewards: Dict[str, Dict[str, int]] = {
            "environment": defaultdict(lambda: 0)
        }
        self.update_buffer: AgentBuffer = AgentBuffer()
        self._stats_reporter.add_property(StatsPropertyType.HYPERPARAMETERS,
                                          self.trainer_settings.as_dict())
        self.framework = self.trainer_settings.framework
        if (self.framework == FrameworkType.PYTORCH
                and not torch_utils.is_available()):
            raise UnityTrainerException(
                "To use the experimental PyTorch backend, install the PyTorch Python package first."
            )

        logger.debug(f"Using framework {self.framework.value}")

        self._next_save_step = 0
        self._next_summary_step = 0
        self.model_saver = self.create_model_saver(self.framework,
                                                   self.trainer_settings,
                                                   self.artifact_path,
                                                   self.load)
Example #16
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    next_observations = [
        np.random.normal(size=shape).astype(np.float32)
        for shape in behavior_spec.observation_shapes
    ]
    action = behavior_spec.action_spec.random_action(1)[0, :]
    for _ in range(number):
        curr_split_obs = SplitObservations.from_observations(curr_observations)
        next_split_obs = SplitObservations.from_observations(next_observations)
        for i, _ in enumerate(curr_split_obs.visual_observations):
            buffer["visual_obs%d" % i].append(
                curr_split_obs.visual_observations[i])
            buffer["next_visual_obs%d" % i].append(
                next_split_obs.visual_observations[i])
        buffer["vector_obs"].append(curr_split_obs.vector_observations)
        buffer["next_vector_in"].append(next_split_obs.vector_observations)
        buffer["actions"].append(action)
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
Example #17
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_obs = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    next_obs = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    action_buffer = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action[BufferKey.CONTINUOUS_ACTION] = action_buffer.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action[BufferKey.DISCRETE_ACTION] = action_buffer.discrete

    for _ in range(number):
        for i, obs in enumerate(curr_obs):
            buffer[ObsUtil.get_name_at(i)].append(obs)
        for i, obs in enumerate(next_obs):
            buffer[ObsUtil.get_name_at_next(i)].append(obs)
        # TODO
        # buffer[AgentBufferKey.ACTIONS].append(action)
        for _act_type, _act in action.items():
            buffer[_act_type].append(_act[0, :])
        # TODO was "rewards"
        buffer[BufferKey.ENVIRONMENT_REWARDS].append(
            np.ones(1, dtype=np.float32) * reward)
        buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))
    buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
    return buffer
Example #18
    def to_agentbuffer(self) -> AgentBuffer:
        """
        Converts a Trajectory to an AgentBuffer.
        :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
        less than the trajectory, as the next observation needs to be populated from the last
        step of the trajectory.
        """
        agent_buffer_trajectory = AgentBuffer()
        obs = self.steps[0].obs
        for step, exp in enumerate(self.steps):
            if step < len(self.steps) - 1:
                next_obs = self.steps[step + 1].obs
            else:
                next_obs = self.next_obs

            num_obs = len(obs)
            for i in range(num_obs):
                agent_buffer_trajectory[ObsUtil.get_name_at(i)].append(obs[i])
                agent_buffer_trajectory[ObsUtil.get_name_at_next(i)].append(
                    next_obs[i])

            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)

            agent_buffer_trajectory["masks"].append(1.0)
            agent_buffer_trajectory["done"].append(exp.done)

            # Adds the log prob and action of continuous/discrete separately
            agent_buffer_trajectory["continuous_action"].append(
                exp.action.continuous)
            agent_buffer_trajectory["discrete_action"].append(
                exp.action.discrete)
            agent_buffer_trajectory["continuous_log_probs"].append(
                exp.action_probs.continuous)
            agent_buffer_trajectory["discrete_log_probs"].append(
                exp.action_probs.discrete)

            # Store action masks if necessary. Note that 1 means active, while
            # in AgentExperience False means active.
            if exp.action_mask is not None:
                mask = 1 - np.concatenate(exp.action_mask)
                agent_buffer_trajectory["action_mask"].append(mask,
                                                              padding_value=1)
            else:
                # This should never be needed unless the environment somehow doesn't supply the
                # action mask in a discrete space.

                action_shape = exp.action.discrete.shape
                agent_buffer_trajectory["action_mask"].append(np.ones(
                    action_shape, dtype=np.float32),
                                                              padding_value=1)
            agent_buffer_trajectory["prev_action"].append(exp.prev_action)
            agent_buffer_trajectory["environment_rewards"].append(exp.reward)

            # Store the next obs as the current
            obs = next_obs
        return agent_buffer_trajectory
Example #19
def test_buffer_truncate():
    agent_1_buffer = construct_fake_buffer(1)
    agent_2_buffer = construct_fake_buffer(2)
    update_buffer = AgentBuffer()
    agent_1_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    agent_2_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    # Test non-LSTM
    update_buffer.truncate(2)
    assert update_buffer.num_experiences == 2

    agent_1_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    agent_2_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    # Test LSTM, truncate should be some multiple of sequence_length
    update_buffer.truncate(4, sequence_length=3)
    assert update_buffer.num_experiences == 3
    for buffer_field in update_buffer.values():
        assert isinstance(buffer_field, AgentBufferField)
Ejemplo n.º 20
0
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    group_spec: AgentGroupSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_step_info = batched_step_result_from_proto(
            [current_pair_info.agent_info], group_spec)
        next_step_info = batched_step_result_from_proto(
            [next_pair_info.agent_info], group_spec)
        previous_action = (np.array(pair_infos[idx].action_info.vector_actions,
                                    dtype=np.float32) * 0)
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions,
                dtype=np.float32)
        curr_agent_id = current_step_info.agent_id[0]
        current_agent_step_info = current_step_info.get_agent_step_result(
            curr_agent_id)
        next_agent_id = next_step_info.agent_id[0]
        next_agent_step_info = next_step_info.get_agent_step_result(
            next_agent_id)

        demo_raw_buffer["done"].append(next_agent_step_info.done)
        demo_raw_buffer["rewards"].append(next_agent_step_info.reward)
        split_obs = SplitObservations.from_observations(
            current_agent_step_info.obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(
            current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_step_info.done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer,
                batch_size=None,
                training_length=sequence_length)
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(demo_processed_buffer,
                                          batch_size=None,
                                          training_length=sequence_length)
    return demo_processed_buffer
Example #21
def test_clear_update_buffer():
    trainer = create_rl_trainer()
    trainer.processing_buffer = construct_fake_processing_buffer()
    trainer.update_buffer = AgentBuffer()
    trainer.processing_buffer.append_to_update_buffer(
        trainer.update_buffer, 2, batch_size=None, training_length=2
    )
    trainer.clear_update_buffer()
    for _, arr in trainer.update_buffer.items():
        assert len(arr) == 0
Example #22
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, experience in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params
        )
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params
        )
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        demo_raw_buffer["done"].append(next_brain_info.local_done[0])
        demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_raw_buffer["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
Example #23
    def __init__(self, *args, **kwargs):
        super(RLTrainer, self).__init__(*args, **kwargs)
        # collected_rewards is a dictionary from name of reward signal to a dictionary of agent_id to cumulative reward
        # used for reporting only. We always want to report the environment reward to Tensorboard, regardless
        # of what reward signals are actually present.
        self.cumulative_returns_since_policy_update: List[float] = []
        self.collected_rewards: Dict[str, Dict[str, int]] = {
            "environment": defaultdict(lambda: 0)
        }
        self.update_buffer: AgentBuffer = AgentBuffer()
        self._stats_reporter.add_property(StatsPropertyType.HYPERPARAMETERS,
                                          self.trainer_settings.as_dict())
Example #24
    def to_agentbuffer(self) -> AgentBuffer:
        """
        Converts a Trajectory to an AgentBuffer.
        :returns: AgentBuffer. Note that the length of the AgentBuffer will be one
        less than the trajectory, as the next observation needs to be populated from the last
        step of the trajectory.
        """
        agent_buffer_trajectory = AgentBuffer()
        vec_vis_obs = SplitObservations.from_observations(self.steps[0].obs)
        for step, exp in enumerate(self.steps):
            if step < len(self.steps) - 1:
                next_vec_vis_obs = SplitObservations.from_observations(
                    self.steps[step + 1].obs)
            else:
                next_vec_vis_obs = SplitObservations.from_observations(
                    self.next_obs)

            for i, _ in enumerate(vec_vis_obs.visual_observations):
                agent_buffer_trajectory["visual_obs%d" % i].append(
                    vec_vis_obs.visual_observations[i])
                agent_buffer_trajectory["next_visual_obs%d" % i].append(
                    next_vec_vis_obs.visual_observations[i])
            agent_buffer_trajectory["vector_obs"].append(
                vec_vis_obs.vector_observations)
            agent_buffer_trajectory["next_vector_in"].append(
                next_vec_vis_obs.vector_observations)
            if exp.memory is not None:
                agent_buffer_trajectory["memory"].append(exp.memory)

            agent_buffer_trajectory["masks"].append(1.0)
            agent_buffer_trajectory["done"].append(exp.done)
            # Add the outputs of the last eval
            if exp.action_pre is not None:
                actions_pre = exp.action_pre
                agent_buffer_trajectory["actions_pre"].append(actions_pre)

            # value is a dictionary from name of reward to value estimate of the value head
            agent_buffer_trajectory["actions"].append(exp.action)
            agent_buffer_trajectory["action_probs"].append(exp.action_probs)

            # Store action masks if necessary. Eventually these will be
            # None for continuous actions
            if exp.action_mask is not None:
                agent_buffer_trajectory["action_mask"].append(exp.action_mask,
                                                              padding_value=1)

            agent_buffer_trajectory["prev_action"].append(exp.prev_action)
            agent_buffer_trajectory["environment_rewards"].append(exp.reward)

            # Store the next visual obs as the current
            vec_vis_obs = next_vec_vis_obs
        return agent_buffer_trajectory
Example #25
def test_buffer_truncate():
    b = construct_fake_processing_buffer()
    update_buffer = AgentBuffer()
    b.append_to_update_buffer(update_buffer,
                              3,
                              batch_size=None,
                              training_length=2)
    b.append_to_update_buffer(update_buffer,
                              2,
                              batch_size=None,
                              training_length=2)
    # Test non-LSTM
    update_buffer.truncate(2)
    assert update_buffer.num_experiences == 2

    b.append_to_update_buffer(update_buffer,
                              3,
                              batch_size=None,
                              training_length=2)
    b.append_to_update_buffer(update_buffer,
                              2,
                              batch_size=None,
                              training_length=2)
    # Test LSTM, truncate should be some multiple of sequence_length
    update_buffer.truncate(4, sequence_length=3)
    assert update_buffer.num_experiences == 3
Example #26
def construct_fake_buffer(fake_agent_id):
    b = AgentBuffer()
    for step in range(9):
        b["vector_observation"].append([
            100 * fake_agent_id + 10 * step + 1,
            100 * fake_agent_id + 10 * step + 2,
            100 * fake_agent_id + 10 * step + 3,
        ])
        b["action"].append([
            100 * fake_agent_id + 10 * step + 4,
            100 * fake_agent_id + 10 * step + 5
        ])
    return b
Example #27
def construct_fake_buffer(fake_agent_id):
    b = AgentBuffer()
    for step in range(9):
        b[ObsUtil.get_name_at(0)].append([
            100 * fake_agent_id + 10 * step + 1,
            100 * fake_agent_id + 10 * step + 2,
            100 * fake_agent_id + 10 * step + 3,
        ])
        b[BufferKey.CONTINUOUS_ACTION].append([
            100 * fake_agent_id + 10 * step + 4,
            100 * fake_agent_id + 10 * step + 5
        ])
    return b
Example #28
    def evaluate_batch(self, mini_batch: AgentBuffer) -> RewardSignalResult:
        """
        Evaluates the reward for the data present in the Dict mini_batch. Use this when evaluating a reward
        function drawn straight from a Buffer.
        :param mini_batch: A Dict of numpy arrays (the format used by our Buffer)
            when drawing from the update buffer.
        :return: a RewardSignalResult of (scaled intrinsic reward, unscaled intrinsic reward) provided by the generator
        """
        mini_batch_len = len(next(iter(mini_batch.values())))
        return RewardSignalResult(
            self.strength * np.zeros(mini_batch_len, dtype=np.float32),
            np.zeros(mini_batch_len, dtype=np.float32),
        )
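
A hypothetical call, assuming signal is an instance of this reward signal and mini_batch is a slice taken from an update buffer, for example via update_buffer.make_mini_batch(start=0, end=32):

scaled, unscaled = signal.evaluate_batch(mini_batch)
assert scaled.shape == unscaled.shape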
Example #29
def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    buffer = ProcessingBuffer()
    update_buffer = AgentBuffer()
    # Make a buffer
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = brain_infos[idx]
        next_brain_info = brain_infos[idx + 1]
        buffer[0].last_brain_info = current_brain_info
        buffer[0]["done"].append(next_brain_info.local_done[0])
        buffer[0]["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            buffer[0]["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
            buffer[0]["next_visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            buffer[0]["vector_obs"].append(current_brain_info.vector_observations[0])
            buffer[0]["next_vector_in"].append(
                current_brain_info.vector_observations[0]
            )
        fake_action_size = len(brain_params.vector_action_space_size)
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = brain_params.vector_action_space_size[0]
        buffer[0]["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer[0]["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer[0]["masks"].append(1.0)
        buffer[0]["advantages"].append(1.0)
        if brain_params.vector_action_space_type == "discrete":
            buffer[0]["action_probs"].append(
                np.ones(sum(brain_params.vector_action_space_size), dtype=np.float32)
            )
        else:
            buffer[0]["action_probs"].append(
                np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
            )
        buffer[0]["actions_pre"].append(
            np.ones(buffer[0]["actions"][0].shape, dtype=np.float32)
        )
        buffer[0]["action_mask"].append(
            np.ones(np.sum(brain_params.vector_action_space_size), dtype=np.float32)
        )
        buffer[0]["memory"].append(np.ones(memory_size, dtype=np.float32))

    buffer.append_to_update_buffer(
        update_buffer, 0, batch_size=None, training_length=sequence_length
    )
    return update_buffer
Example #30
def test_num_experiences():
    agent_1_buffer = construct_fake_buffer(1)
    agent_2_buffer = construct_fake_buffer(2)
    update_buffer = AgentBuffer()

    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 0
    assert update_buffer.num_experiences == 0
    agent_1_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)
    agent_2_buffer.resequence_and_append(update_buffer,
                                         batch_size=None,
                                         training_length=2)

    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
    assert update_buffer.num_experiences == 20
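
Why 20 rather than 18: construct_fake_buffer adds 9 experiences per agent, and resequencing with training_length=2 pads each agent's data out to a whole number of length-2 sequences, so both agents contribute 10 experiences. A quick check of that arithmetic:

steps_per_agent, training_length, num_agents = 9, 2, 2
padded = -(-steps_per_agent // training_length) * training_length  # ceil to 10
assert padded * num_agents == 20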