Example 1
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )

    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info
        )

    # Assert that two trajectories have been added to the Trainer
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        observation_shapes=[(8,)] + num_vis_obs * [(84, 84, 3)],
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty()
    )
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
Example 2
def test_end_episode():
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }

    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        observation_shapes=[(8,)],
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )

    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
Example 3
    def run(self):
        # reset unity environment before start
        self.unity_env.reset()

        while True:
            decision_steps, terminal_steps = self.unity_env.get_steps(
                self._behavior_name)
            actions = []
            # First process all envs/"agents" that finished their episodes in
            # terminal steps; agents in decision steps are handled below.
            for agent_id in terminal_steps.agent_id:
                # first check if a new episode needs to be started
                if agent_id not in self.agentID_to_episodeID.keys():
                    episode_id = self.start_episode()
                    self.agentID_to_episodeID[agent_id] = episode_id
                episode_id = self.agentID_to_episodeID[agent_id]
                # get observation, rewards and info
                obs = terminal_steps[agent_id].obs
                obs = obs[0] if len(obs) == 1 else obs
                reward = terminal_steps[agent_id].reward
                info = {"interrupted": terminal_steps[agent_id].interrupted}
                self.log_returns(episode_id, reward, info)
                # end episode and remove agent_id from self.agentID_to_episodeID
                self.end_episode(episode_id, obs)
                self.agentID_to_episodeID.pop(agent_id)

            for agent_id in decision_steps.agent_id:
                # first check if a new episode needs to be started
                if agent_id not in self.agentID_to_episodeID.keys():
                    episode_id = self.start_episode()
                    self.agentID_to_episodeID[agent_id] = episode_id
                episode_id = self.agentID_to_episodeID[agent_id]
                # get observation and reward and request action
                obs = decision_steps[agent_id].obs
                obs = obs[0] if len(obs) == 1 else obs
                reward = decision_steps[agent_id].reward
                # log reward and request action
                self.log_returns(episode_id, reward)
                actions.append(self.get_action(episode_id, obs))
            # set actions in Unity environment
            if actions:
                if actions[0].dtype == np.float32:
                    action_tuple = ActionTuple(continuous=np.array(actions))
                else:
                    action_tuple = ActionTuple(discrete=np.array(actions))
                # Only submit actions when at least one agent requested a decision;
                # otherwise action_tuple would be undefined here.
                self.unity_env.set_actions(self._behavior_name, action_tuple)

            self.unity_env.step()
Example 4
def evaluate_population(pop, env, net_shapes, inference=False):
    """
    Evaluation of a whole population 
    """
    behavior_name = list(env.behavior_specs)[0]
    spec = env.behavior_specs[behavior_name]
    action_spec = spec.action_spec

    fitnesses = [0.] * len(pop)
    env.reset()

    decision_steps, terminal_steps = env.get_steps(behavior_name)

    # Play in the environment for a fixed number of steps
    episode_length = 1000
    while episode_length > 1:
        for agent_id in terminal_steps.agent_id:
            fitnesses[agent_id] += terminal_steps[agent_id].reward
        
        if len(terminal_steps) == len(pop):
            break
        # Generate an action for all agents
        actions = np.empty((len(decision_steps), action_spec.continuous_size))
        for agent_id in decision_steps.agent_id:
            state = decision_steps[agent_id].obs[0]
            fitnesses[agent_id] += decision_steps[agent_id].reward
            individual = params_reshape(net_shapes, pop[agent_id])
            if action_spec.is_discrete():
                action_discrete_probs = get_action(individual, state)  # returns a probability for each action
                actions_discrete = np.argmax(action_discrete_probs)  # choose the action with the highest probability
                actions[agent_id] = actions_discrete
            elif action_spec.is_continuous():
                action_continuous_probs = get_action(individual, state, continuous=True)  # returns a value for each continuous action
                actions[agent_id] = action_continuous_probs

        if episode_length % 251 == 0 and not inference:
            env.reset() # reset to change initial position from time to time
        else:
            if action_spec.is_discrete():
                action = ActionTuple(discrete=actions)
            elif action_spec.is_continuous():
                action = ActionTuple(continuous=actions)
            env.set_actions(behavior_name, action)
            env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)
        episode_length -= 1
    return fitnesses
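A side note on the discrete branch above: `actions` is a float buffer, while the ActionTuple examples in this listing consistently use a 2-D int32 array with one row per agent and one column per discrete branch. A minimal sketch of that conversion, assuming a single branch and hypothetical per-agent argmax choices:

import numpy as np
from mlagents_envs.base_env import ActionTuple

# Hypothetical per-agent choices, e.g. argmax over the action probabilities.
chosen_indices = [2, 0, 1]

# Shape (n_agents, n_discrete_branches), dtype int32.
discrete = np.array(chosen_indices, dtype=np.int32).reshape(-1, 1)
action = ActionTuple(discrete=discrete)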
Example 5
 def solve(self) -> None:
     self.reset()
     for _ in range(self.n_demos):
         for name in self.names:
             if self.action_spec.discrete_size > 0:
                 self.action[name] = ActionTuple(
                     np.array([], dtype=np.float32),
                     np.array([[1]] if self.goal[name] > 0 else [[0]],
                              dtype=np.int32),
                 )
             else:
                 self.action[name] = ActionTuple(
                     np.array([[float(self.goal[name])]], dtype=np.float32),
                     np.array([], dtype=np.int32),
                 )
         self.step()
Example 6
def test_step(mock_communicator, mock_launcher):
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    spec = env.behavior_specs["RealFakeBrain"]
    env.step()
    decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
    n_agents = len(decision_steps)
    env.set_actions("RealFakeBrain", spec.action_spec.empty_action(n_agents))
    env.step()
    with pytest.raises(UnityActionException):
        env.set_actions("RealFakeBrain",
                        spec.action_spec.empty_action(n_agents - 1))
    decision_steps, terminal_steps = env.get_steps("RealFakeBrain")
    n_agents = len(decision_steps)
    _empty_act = spec.action_spec.empty_action(n_agents)
    next_action = ActionTuple(_empty_act.continuous - 1,
                              _empty_act.discrete - 1)
    env.set_actions("RealFakeBrain", next_action)
    env.step()

    env.close()
    assert isinstance(decision_steps, DecisionSteps)
    assert isinstance(terminal_steps, TerminalSteps)
    assert len(spec.observation_specs) == len(decision_steps.obs)
    assert len(spec.observation_specs) == len(terminal_steps.obs)
    for spec, obs in zip(spec.observation_specs, decision_steps.obs):
        assert (n_agents, ) + spec.shape == obs.shape
    assert 0 in decision_steps
    assert 2 in terminal_steps
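`empty_action` above is a convenient way to obtain a correctly shaped no-op ActionTuple. A minimal sketch of the shapes involved, assuming a hypothetical hybrid spec with two continuous values and one 3-way discrete branch:

from mlagents_envs.base_env import ActionSpec

action_spec = ActionSpec(continuous_size=2, discrete_branches=(3,))
n_agents = 4

empty = action_spec.empty_action(n_agents)
# One row per agent: zeros for the continuous part and for each discrete branch.
assert empty.continuous.shape == (n_agents, 2)
assert empty.discrete.shape == (n_agents, 1)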
Example 7
def _create_action_info(num_agents: int, agent_ids: List[str]) -> ActionInfo:
    fake_action_outputs = {
        "action": ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
    }
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]] * num_agents, dtype=np.float32)),
        env_action=ActionTuple(
            continuous=np.array([[0.1]] * num_agents, dtype=np.float32)
        ),
        outputs=fake_action_outputs,
        agent_ids=agent_ids,
    )
    return fake_action_info
Example 8
    def ConvertList2DiscreteAction(self, arr, behavior_name):
        '''
        Input: a list or 1-D array of discrete branch choices, e.g. [3].
               Do not pass a 2-D array or a list like [(0, 2)].
        Output: an ActionTuple containing the discrete action.
        '''
        actionList = []
        actionList.append(arr)
        _discrete = np.array(actionList, dtype=np.int32)
        action = ActionTuple(discrete=_discrete)

        return action
Example 9
def test_set_action_multi_agent():
    engine_config_channel = EngineConfigurationChannel()
    env = default_registry[BALL_ID].make(
        base_port=6001,
        worker_id=0,
        no_graphics=True,
        side_channels=[engine_config_channel],
    )
    engine_config_channel.set_configuration_parameters(time_scale=100)
    for _ in range(3):
        env.reset()
        behavior_name = list(env.behavior_specs.keys())[0]
        d, t = env.get_steps(behavior_name)
        for _ in range(50):
            action = np.ones((len(d), 2))
            action_tuple = ActionTuple()
            action_tuple.add_continuous(action)
            env.set_actions(behavior_name, action_tuple)
            env.step()
            d, t = env.get_steps(behavior_name)
    env.close()
Example 10
    def step(self, action):
        # Reshape to (10, 9) as needed for the wrapper
        action = action.reshape((10, 9))
        act = ActionTuple(action)
        self.env.set_actions(self.behavior_name, act)

        self.env.step()
        decision_steps, terminal_steps = self.env.get_steps(self.behavior_name)
        observation, reward = self._decision_to_observation(decision_steps)
        done = len(decision_steps) == 0
        info = {}
        return observation, reward, done, info
Example 11
    def step(self, action):
        """Runs one timestep of the environment"s dynamics.
        Once an episode is done, reset() has to be called manually.
                
        Arguments:
            action {List} -- A list of at least one discrete action to be executed by the agent

        Returns:
            {numpy.ndarray} -- Visual observation
            {numpy.ndarray} -- Vector observation
            {float} -- (Total) Scalar reward signaled by the environment
            {bool} -- Whether the episode of the environment terminated
            {dict} -- Further episode information (e.g. cumulative reward) retrieved from the environment once an episode has completed
        """
        # Carry out the agent's action
        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.asarray(action).reshape([1, -1]))
        self._env.set_actions(self._behavior_name, action_tuple)
        self._env.step()
        info, terminal_info = self._env.get_steps(self._behavior_name)

        # Process step results
        vis_obs, vec_obs, reward, done = self._process_agent_info(info, terminal_info)
        self._rewards.append(reward)

        # Record trajectory data
        if self._record:
            self._trajectory["vis_obs"].append(vis_obs * 255)
            self._trajectory["vec_obs"].append(vec_obs)
            self._trajectory["rewards"].append(reward)
            self._trajectory["actions"].append(action)

        # Episode information
        if done:
            info = {"reward": sum(self._rewards),
                    "length": len(self._rewards)}
        else:
            info = None

        return vis_obs, vec_obs, reward, done, info
Example 12
    def step(self, action: List[Any]) -> GymStepResult:
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information.
        """
        if self._flattener is not None:
            # Translate action into list
            action = self._flattener.lookup_action(action)

        action = np.array(action).reshape((1, self.action_size))

        action_tuple = ActionTuple()
        if self.group_spec.action_spec.is_continuous():
            action_tuple.add_continuous(action)
        else:
            action_tuple.add_discrete(action)
        self._env.set_actions(self.name, action_tuple)

        self._env.step()
        decision_step, terminal_step = self._env.get_steps(self.name)
        self._check_agents(max(len(decision_step), len(terminal_step)))
        if len(terminal_step) != 0:
            # The agent is done
            self.game_over = True
            return self._single_step(terminal_step)
        else:
            return self._single_step(decision_step)
Example 13
    def step(
        self, actions: List[np.ndarray]
    ) -> GymResult:  # todo add support for ActionTuple(continuous, discrete)
        curr_action_idx = 0
        for team in self.team_names:
            # print(f'start idx={curr_action_idx}. End idx = {curr_action_idx + self.agent_per_team[team]}')
            # print(f'len action list:{len(actions[curr_action_idx:curr_action_idx + self.agent_per_team[team]])}')
            action = np.vstack(actions[curr_action_idx:curr_action_idx +
                                       self.agent_per_team[team]])
            # print(f'actions shape:{action.shape}')
            self._e.set_actions(team, ActionTuple(action))
            curr_action_idx += self.agent_per_team[team]

        self._e.step()
        self.n += 1

        return self.collect_obs()
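As the todo in this example notes, `ActionTuple(action)` only fills the continuous part, since the first positional argument of `ActionTuple` is the continuous array (see Example 5, which passes both positionally). A minimal sketch of passing both halves explicitly, with placeholder zero arrays:

import numpy as np
from mlagents_envs.base_env import ActionTuple

n_agents = 3
hybrid_action = ActionTuple(
    continuous=np.zeros((n_agents, 2), dtype=np.float32),  # two continuous values per agent
    discrete=np.zeros((n_agents, 1), dtype=np.int32),      # one discrete branch per agent
)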
Example 14
 def to_action_tuple(self, clip: bool = False) -> ActionTuple:
     """
     Returns an ActionTuple
     """
     action_tuple = ActionTuple()
     if self.continuous_tensor is not None:
         _continuous_tensor = self.continuous_tensor
         if clip:
             _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3
         continuous = ModelUtils.to_numpy(_continuous_tensor)
         action_tuple.add_continuous(continuous)
     if self.discrete_list is not None:
         discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
         action_tuple.add_discrete(discrete)
     return action_tuple
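The clamp-and-rescale step above squashes continuous actions into [-1, 1] before they are handed to the environment. On the numpy side the same pattern looks like this; a minimal sketch assuming a hypothetical raw_continuous batch of raw policy outputs:

import numpy as np
from mlagents_envs.base_env import ActionTuple

raw_continuous = np.random.randn(4, 2).astype(np.float32)  # 4 agents, 2 continuous actions each

action_tuple = ActionTuple()
action_tuple.add_continuous(np.clip(raw_continuous, -3, 3) / 3)  # squash into [-1, 1]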
Example 15
 def _add_group_status_and_obs(
     self, step: Union[TerminalStep, DecisionStep], worker_id: int
 ) -> None:
     """
     Takes a TerminalStep or DecisionStep and adds the information in it
     to self.group_status. This information can then be retrieved
     when constructing trajectories to get the status of group mates. Also stores the current
     observation into current_group_obs, to be used to get the next group observations
     for bootstrapping.
     :param step: TerminalStep or DecisionStep
     :param worker_id: Worker ID of this particular environment. Used to generate a
         global group id.
     """
     global_agent_id = get_global_agent_id(worker_id, step.agent_id)
     stored_decision_step, idx = self._last_step_result.get(
         global_agent_id, (None, None)
     )
     stored_take_action_outputs = self._last_take_action_outputs.get(
         global_agent_id, None
     )
     if stored_decision_step is not None and stored_take_action_outputs is not None:
         # 0, the default group_id, means that the agent doesn't belong to an agent group.
         # If 0, don't add any groupmate information.
         if step.group_id > 0:
             global_group_id = get_global_group_id(worker_id, step.group_id)
             stored_actions = stored_take_action_outputs["action"]
             action_tuple = ActionTuple(
                 continuous=stored_actions.continuous[idx],
                 discrete=stored_actions.discrete[idx],
             )
             group_status = AgentStatus(
                 obs=stored_decision_step.obs,
                 reward=step.reward,
                 action=action_tuple,
                 done=isinstance(step, TerminalStep),
             )
             self._group_status[global_group_id][global_agent_id] = group_status
             self._current_group_obs[global_group_id][global_agent_id] = step.obs
Example 16
 def set_actions(self, behavior_name, action):
     # The ActionTuple contains the actions for all n_agents. This
     # slices the ActionTuple into an action tuple for each environment
     # and sets it. The index j is used to ignore agents that have already
     # reached done.
     j = 0
     for i in range(self.num_agents):
         _act = ActionTuple()
         name_and_num = behavior_name + str(i)
         env = self.envs[name_and_num]
         if not self.dones[name_and_num]:
             if self.action_spec.continuous_size > 0:
                 _act.add_continuous(action.continuous[j:j + 1])
             if self.action_spec.discrete_size > 0:
                 _disc_list = [action.discrete[j, :]]
                 _act.add_discrete(np.array(_disc_list))
             j += 1
             env.action[behavior_name] = _act
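The slicing with `j:j + 1` keeps the leading batch dimension, so each sub-environment still receives a 2-D array of shape (1, size). A minimal sketch of that slicing on its own, with placeholder zero actions; for a live UnityEnvironment, `set_action_for_agent` (used in Example 24) covers the same per-agent case directly:

import numpy as np
from mlagents_envs.base_env import ActionTuple

# Batched actions for 3 agents: 2 continuous values and 1 discrete branch each.
batch = ActionTuple(
    continuous=np.zeros((3, 2), dtype=np.float32),
    discrete=np.zeros((3, 1), dtype=np.int32),
)

j = 1  # pick out the second agent while keeping the batch dimension
single = ActionTuple(continuous=batch.continuous[j:j + 1], discrete=batch.discrete[j:j + 1])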
Example 17
    def step(self, action: List[Any]) -> GymStepResult:
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object/list): an action provided by the agent
        Returns:
            observation (object/list): agent's observation of the current environment
            reward (float/list) : amount of reward returned after previous action
            done (boolean/list): whether the episode has ended.
            info (dict): contains auxiliary diagnostic information.
        """
        if self._flattener is not None:
            # Translate action into list
            action = self._flattener.lookup_action(action)

        action = np.array(action).reshape((-1, self.action_size))

        action_tuple = ActionTuple()
        if self.group_spec.action_spec.is_continuous():
            action_tuple.add_continuous(action)
        else:
            action_tuple.add_discrete(action)

        self._env.set_actions(self.name, action_tuple)
        self._env.step()

        decision_step, terminal_step = self._env.get_steps(self.name)

        try:
            return self.combine_steps(decision_step, terminal_step)
        except KeyError:
            self.key_error_counter += 1
            # print(f"{self.key_error_counter}th KeyError in UnityToMultiGymWrapper. Previous step returned.")
            return self.last_stepreturn
Example 18
"""
Created on Fri May  7 11:06:32 2021

@author: Win10
"""

import numpy as np
import matplotlib.pyplot as plt
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.base_env import ActionTuple

# Run this script in Spyder first, then press Play in the Unity editor.
print('Loading environment, prepare to hit the Play button!')
env = UnityEnvironment(file_name=None, side_channels=[])
# Start interacting with the environment.
env.reset()

behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]

#vis_obs = any(len(spec.shape) == 3 for spec in spec.observation_specs)
decision_steps, terminal_steps = env.get_steps(behavior_name)
# A manually built continuous action (immediately replaced by a random action below).
action = ActionTuple(np.array([[1.0, 1.0]], dtype=np.float32))

action = spec.action_spec.random_action(len(decision_steps))
env.set_actions(behavior_name, action)

env.step()

decision_steps2, terminal_steps2 = env.get_steps(behavior_name)
env.close()
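The same editor-attached session can be extended into a short random-action loop; a minimal sketch under the same assumption that a Unity scene is waiting for the Play button:

from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None, side_channels=[])
env.reset()
behavior_name = list(env.behavior_specs)[0]
spec = env.behavior_specs[behavior_name]

for _ in range(100):  # arbitrary number of steps
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    action = spec.action_spec.random_action(len(decision_steps))
    env.set_actions(behavior_name, action)
    env.step()
env.close()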
Example 19
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)

        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
Example 20
    def get_action(self,
                   decision_requests: DecisionSteps,
                   worker_id: int = 0) -> ActionInfo:
        """
        Decides actions given observations information, and takes them in environment.
        :param decision_requests: A dictionary of brain names and DecisionSteps from environment.
        :param worker_id: In parallel environment training, the unique id of the environment worker that
            the DecisionSteps came from. Used to construct a globally unique id for each agent.
        :return: an ActionInfo containing action, memories, values and an object
        to be passed to add experiences
        """
        if len(decision_requests) == 0:
            return ActionInfo.empty()

        global_agent_ids = [
            get_global_agent_id(worker_id, int(agent_id))
            for agent_id in decision_requests.agent_id
        ]  # For 1-D array, the iterator order is correct.

        run_out = self.evaluate(  # pylint: disable=assignment-from-no-return
            decision_requests, global_agent_ids)

        self.save_memories(global_agent_ids, run_out.get("memory_out"))
        # For Compatibility with buffer changes for hybrid action support
        if "log_probs" in run_out:
            log_probs_tuple = LogProbsTuple()
            if self.behavior_spec.action_spec.is_continuous():
                log_probs_tuple.add_continuous(run_out["log_probs"])
            else:
                log_probs_tuple.add_discrete(run_out["log_probs"])
            run_out["log_probs"] = log_probs_tuple
        if "action" in run_out:
            action_tuple = ActionTuple()
            env_action_tuple = ActionTuple()
            if self.behavior_spec.action_spec.is_continuous():
                action_tuple.add_continuous(run_out["pre_action"])
                env_action_tuple.add_continuous(run_out["action"])
            else:
                action_tuple.add_discrete(run_out["action"])
                env_action_tuple.add_discrete(run_out["action"])
            run_out["action"] = action_tuple
            run_out["env_action"] = env_action_tuple
        self.check_nan_action(run_out.get("action"))
        return ActionInfo(
            action=run_out.get("action"),
            env_action=run_out.get("env_action"),
            value=run_out.get("value"),
            outputs=run_out,
            agent_ids=decision_requests.agent_id,
        )
Example 21
def stepDiscreteAction(behavior_name, arrlist):
    _discrete = np.array(arrlist, dtype=np.int32)
    action = ActionTuple(discrete=_discrete)
    env.set_actions(behavior_name, action)
    env.step()
Example 22
    print("Is there a visual observation ?", vis_obs_bool)

    #print action is_discrete
    print("Is action is discrete ?", spec.action_spec.is_discrete())

    #print action is_continuous
    print("Is action is continus ?", spec.action_spec.is_continuous())

    # make zero-filled continuous and discrete actions, and step
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    n_agents = len(decision_steps)
    _continuous = np.zeros((n_agents, spec.action_spec.continuous_size),
                           dtype=np.float32)
    _discrete = np.zeros((n_agents, spec.action_spec.discrete_size),
                         dtype=np.int32)
    action = ActionTuple(continuous=_continuous, discrete=_discrete)
    env.set_actions(behavior_name, action)
    env.step()

    # make a custom discrete action, and step
    actionarr = [[1]]  # list of shape (num_agents=1, discrete_size=1)
    _discrete = np.array(actionarr, dtype=np.int32)
    action = ActionTuple(discrete=_discrete)
    env.set_actions(behavior_name, action)
    env.step()

    #Get step information to get observation
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    #visual observation
    for index, shape in enumerate(spec.observation_shapes):
        if len(shape) == 3:
Example 23
    def _process_step(
        self, step: Union[TerminalStep, DecisionStep], worker_id: int, index: int
    ) -> None:
        terminated = isinstance(step, TerminalStep)
        global_agent_id = get_global_agent_id(worker_id, step.agent_id)
        global_group_id = get_global_group_id(worker_id, step.group_id)
        stored_decision_step, idx = self._last_step_result.get(
            global_agent_id, (None, None)
        )
        stored_take_action_outputs = self._last_take_action_outputs.get(
            global_agent_id, None
        )
        if not terminated:
            # Index is needed to grab from last_take_action_outputs
            self._last_step_result[global_agent_id] = (step, index)

        # This state is the consequence of a past action
        if stored_decision_step is not None and stored_take_action_outputs is not None:
            obs = stored_decision_step.obs
            if self.policy.use_recurrent:
                memory = self.policy.retrieve_previous_memories([global_agent_id])[0, :]
            else:
                memory = None
            done = terminated  # Since this is an ongoing step
            interrupted = step.interrupted if terminated else False
            # Add the outputs of the last eval
            stored_actions = stored_take_action_outputs["action"]
            action_tuple = ActionTuple(
                continuous=stored_actions.continuous[idx],
                discrete=stored_actions.discrete[idx],
            )
            stored_action_probs = stored_take_action_outputs["log_probs"]
            log_probs_tuple = LogProbsTuple(
                continuous=stored_action_probs.continuous[idx],
                discrete=stored_action_probs.discrete[idx],
            )
            action_mask = stored_decision_step.action_mask
            prev_action = self.policy.retrieve_previous_action([global_agent_id])[0, :]

            # Assemble teammate_obs. If none saved, then it will be an empty list.
            group_statuses = []
            for _id, _mate_status in self._group_status[global_group_id].items():
                if _id != global_agent_id:
                    group_statuses.append(_mate_status)

            experience = AgentExperience(
                obs=obs,
                reward=step.reward,
                done=done,
                action=action_tuple,
                action_probs=log_probs_tuple,
                action_mask=action_mask,
                prev_action=prev_action,
                interrupted=interrupted,
                memory=memory,
                group_status=group_statuses,
                group_reward=step.group_reward,
            )
            # Add the value outputs if needed
            self._experience_buffers[global_agent_id].append(experience)
            self._episode_rewards[global_agent_id] += step.reward
            if not terminated:
                self._episode_steps[global_agent_id] += 1

            # Add a trajectory segment to the buffer if terminal or the length has reached the time horizon
            if (
                len(self._experience_buffers[global_agent_id])
                >= self._max_trajectory_length
                or terminated
            ):
                next_obs = step.obs
                next_group_obs = []
                for _id, _obs in self._current_group_obs[global_group_id].items():
                    if _id != global_agent_id:
                        next_group_obs.append(_obs)

                trajectory = Trajectory(
                    steps=self._experience_buffers[global_agent_id],
                    agent_id=global_agent_id,
                    next_obs=next_obs,
                    next_group_obs=next_group_obs,
                    behavior_id=self._behavior_id,
                )
                for traj_queue in self._trajectory_queues:
                    traj_queue.put(trajectory)
                self._experience_buffers[global_agent_id] = []
            if terminated:
                # Record episode length.
                self._stats_reporter.add_stat(
                    "Environment/Episode Length",
                    self._episode_steps.get(global_agent_id, 0),
                )
                self._clean_agent_data(global_agent_id)
Example 24
    def step(
        self, action_dict: MultiAgentDict
    ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]:
        """Performs one multi-agent step through the game.

        Args:
            action_dict (dict): Multi-agent action dict with:
                keys=agent identifier consisting of
                [MLagents behavior name, e.g. "Goalie?team=1"] + "_" +
                [Agent index, a unique MLAgent-assigned index per single agent]

        Returns:
            tuple:
                - obs: Multi-agent observation dict.
                    Only those observations for which to get new actions are
                    returned.
                - rewards: Rewards dict matching `obs`.
                - dones: Done dict with only an __all__ multi-agent entry in
                    it. __all__=True, if episode is done for all agents.
                - infos: An (empty) info dict.
        """
        from mlagents_envs.base_env import ActionTuple

        # Set only the required actions (from the DecisionSteps) in Unity3D.
        all_agents = []
        for behavior_name in self.unity_env.behavior_specs:
            # New ML-Agents API: Set all agents actions at the same time
            # via an ActionTuple. Since API v1.4.0.
            if self.api_version[0] > 1 or (self.api_version[0] == 1
                                           and self.api_version[1] >= 4):
                actions = []
                for agent_id in self.unity_env.get_steps(
                        behavior_name)[0].agent_id:
                    key = behavior_name + "_{}".format(agent_id)
                    all_agents.append(key)
                    actions.append(action_dict[key])
                if actions:
                    if actions[0].dtype == np.float32:
                        action_tuple = ActionTuple(
                            continuous=np.array(actions))
                    else:
                        action_tuple = ActionTuple(discrete=np.array(actions))
                    self.unity_env.set_actions(behavior_name, action_tuple)
            # Old behavior: Do not use an ActionTuple and set each agent's
            # action individually.
            else:
                for agent_id in self.unity_env.get_steps(
                        behavior_name)[0].agent_id_to_index.keys():
                    key = behavior_name + "_{}".format(agent_id)
                    all_agents.append(key)
                    self.unity_env.set_action_for_agent(
                        behavior_name, agent_id, action_dict[key])
        # Do the step.
        self.unity_env.step()

        obs, rewards, dones, infos = self._get_step_results()

        # Global horizon reached? -> Return __all__ done=True, so user
        # can reset. Set all agents' individual `done` to True as well.
        self.episode_timesteps += 1
        if self.episode_timesteps > self.episode_horizon:
            return obs, rewards, dict({"__all__": True}, **{
                agent_id: True
                for agent_id in all_agents
            }), infos

        return obs, rewards, dones, infos