Example #1
    def sample(self):
        """ Draw a random (v, w) command from this space and wrap it in an Action. """
        # uniform() excludes its upper bound, so for integer dtypes the high value is
        # widened by 1 to keep self.high reachable after the astype() cast below.
        v, w = SPACE_LOCAL_RANDOM_STATE.uniform(
            low=self.low,
            high=self.high + (0 if np.dtype(self.dtype).kind == 'f' else 1),
            size=self.low.shape).astype(self.dtype)

        return Action(command=np.array([v, w]))
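
The snippet above relies on an enclosing space class with low, high and dtype attributes and a module-level SPACE_LOCAL_RANDOM_STATE. The class below is a minimal, hypothetical sketch of that context (the class name and constructor are assumptions, not the project's actual API); only the sample() body is taken from the example.

import numpy as np

from bc_gym_planning_env.envs.base.action import Action

# Hypothetical module-level random state, mirroring the name used in the example.
SPACE_LOCAL_RANDOM_STATE = np.random.RandomState(0)


class CommandSpace:
    """ Hypothetical (v, w) command space; only the pieces sample() relies on. """

    def __init__(self, low, high, dtype=np.float32):
        self.low = np.asarray(low, dtype=dtype)
        self.high = np.asarray(high, dtype=dtype)
        self.dtype = dtype

    def sample(self):
        # Same body as the example above.
        v, w = SPACE_LOCAL_RANDOM_STATE.uniform(
            low=self.low,
            high=self.high + (0 if np.dtype(self.dtype).kind == 'f' else 1),
            size=self.low.shape).astype(self.dtype)
        return Action(command=np.array([v, w]))


# Usage: draw one random wheel-velocity / steering command.
space = CommandSpace(low=[-1.0, -1.0], high=[1.0, 1.0])
random_action = space.sample()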
Example #2
def record_take(model, env_instance, device, debug=False):
    """run one rollout of the rl model with the environment, until done is true
    :param model: rl policy model
    :param env_instance: an instance of the environment to be evaluated
    :param device: cpu or gpu
    :param debug: debug mode has gui output
    :return: some basic metric info of this rollout
    """
    frames = []
    steps = 0
    rewards = 0
    observation = env_instance.reset()

    print("Evaluating environment...")

    while True:
        observation_tensor = _dict_to_tensor(observation, device)
        if isinstance(model, PolicyGradientModel):
            actions = model.step(
                observation_tensor,
                argmax_sampling=False)['actions'].to(device)[0]
        elif isinstance(model, DeterministicPolicyModel):
            actions = model.step(observation_tensor)['actions'].to(device)[0]
        else:
            raise NotImplementedError
        action_class = Action(command=actions.cpu().numpy())
        observation, reward, done, epinfo = env_instance.step(action_class)
        steps += 1
        rewards += reward
        if debug or device.type == 'cpu':
            frames.append(env_instance.render(mode='human'))

        if done:
            print("episode reward: {}, steps: {}".format(rewards, steps))
            return {'r': rewards, 'l': steps, 'frames': frames}
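
A hedged usage sketch for record_take: the two factory calls below are placeholders (not part of the example or the project's API); any environment instance and any PolicyGradientModel / DeterministicPolicyModel-compatible policy can be substituted.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env_instance = make_env()             # placeholder factory, assumed
model = load_trained_policy(device)   # placeholder factory, assumed

episode_stats = record_take(model, env_instance, device, debug=False)
print("reward: {}, steps: {}, frames collected: {}".format(
    episode_stats['r'], episode_stats['l'], len(episode_stats['frames'])))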
Example #3
    def step(self, action):
        """
        Take a step in the environment.
        :param action: a numpy array with the same size as the action dimension
        :return: a tuple of (obs, r, done, info) returned by the environment
        """
        obs, r, done, info = self.env.step(Action(np.array(action)))
        return obs, r, done, info
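
The step above reads like part of a wrapper that accepts plain numpy arrays and converts them into Action objects before delegating to the wrapped environment. A minimal, hypothetical wrapper built around it might look like this (the class name and the reset() passthrough are assumptions):

import numpy as np

from bc_gym_planning_env.envs.base.action import Action


class NumpyActionWrapper:
    """ Hypothetical wrapper: takes raw numpy actions and wraps them in Action. """

    def __init__(self, env):
        self.env = env

    def reset(self):
        return self.env.reset()

    def step(self, action):
        # Same delegation as in the example above.
        obs, r, done, info = self.env.step(Action(np.array(action)))
        return obs, r, done, info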
Example #4
    def before_env_step(self):
        """
        Things you need to do before the environment step:
        pass inputs to the agent, get back actions and meta-actions.
        """
        # render the picture & wait for the user key input on it
        key = self._display(self._img)
        v, w = self._interpret(key)

        # turn the interpreted (v, w) into an action command
        action = np.zeros(2, dtype=np.float32)
        if v is not None:
            action[0] = 0.5 * v
        if w is not None:
            coeff = np.pi / 2.0 * 0.1
            self._front_wheel_steering_rotation_state = np.clip(
                self._front_wheel_steering_rotation_state + coeff * w,
                -np.pi / 2.0, np.pi / 2.0)
            action[1] = self._front_wheel_steering_rotation_state

        print(80 * '=')
        print(action)

        self._action = Action(command=action)
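
The _display and _interpret helpers are not shown in this example. Purely as an illustration of the interface before_env_step relies on, a hypothetical key-to-(v, w) mapping could look like the sketch below; the actual bindings and sign conventions in the project may differ.

    # Hypothetical helper, NOT the project's actual implementation: maps the key code
    # returned by self._display() to a (v, w) pair, where v scales forward velocity
    # and w nudges the steering angle. None means "leave that component unchanged".
    _KEY_TO_COMMAND = {
        ord('w'): (1.0, None),    # forward
        ord('s'): (-1.0, None),   # backward
        ord('a'): (None, 1.0),    # steer left
        ord('d'): (None, -1.0),   # steer right
    }

    def _interpret(self, key):
        """ Hypothetical: interpret a pressed key as (v, w); unknown keys change nothing. """
        return self._KEY_TO_COMMAND.get(key, (None, None))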
Example #5
""" Run a random mini environment with egocentric costmap observation wrapper. """
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import numpy as np
from bc_gym_planning_env.envs.base.action import Action
from bc_gym_planning_env.envs.mini_env import RandomMiniEnv
from bc_gym_planning_env.envs.egocentric import EgocentricCostmap


if __name__ == '__main__':
    for seed in range(1000):
        print(seed)

        env = RandomMiniEnv()
        env = EgocentricCostmap(env)

        env.seed(seed)
        env.reset()
        env.render()

        done = False

        while not done:
            action = Action(command=np.array([0.3, 0.0]))
            _, _, done, _ = env.step(action)
            env.render()
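
The constant (0.3, 0.0) command above just drives the robot forward. Assuming the wrapped environment exposes an action space whose sample() returns an Action, as the sample() method in Example #1 does (an assumption, not something this script shows), the loop can be made fully random:

        while not done:
            # Assumes env.action_space.sample() exists and returns an Action,
            # like the sample() method shown in Example #1.
            action = env.action_space.sample()
            _, _, done, _ = env.step(action)
            env.render()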
Example #6
    def rollout(self, batch_info, model):
        """ Calculate env rollout """
        observation_accumulator = {}  # Device tensors
        observation_accumulator['environment'] = []
        observation_accumulator['goal'] = []
        action_accumulator = []  # Device tensors
        value_accumulator = []  # Device tensors
        dones_accumulator = []  # Device tensors
        rewards_accumulator = []  # Device tensors
        episode_information = []  # Python objects
        logprobs_accumulator = []  # Device tensors

        if self.hidden_state is None and model.is_recurrent:
            self.hidden_state = torch.zeros(
                (self.last_observation.size(0), model.state_dim),
                device=self.device,
                dtype=torch.float32
            )

        # Remember rollout initial state, we'll use that for learning as well
        initial_hidden_state = self.hidden_state

        for step_idx in range(self.number_of_steps):
            if model.is_recurrent:
                step = model.step(self.last_observation, state=self.hidden_state)
                self.hidden_state = step['state']
            else:
                step = model.step(self.last_observation)

            actions, values, logprobs = step['actions'], step['values'], step['logprobs']

            if isinstance(self.last_observation, dict):
                observation_accumulator['environment'].append(self.last_observation['environment'])
                observation_accumulator['goal'].append(self.last_observation['goal'])
            else:
                observation_accumulator['environment'].append(self.last_observation)

            action_accumulator.append(actions)
            value_accumulator.append(values)
            logprobs_accumulator.append(logprobs)

            actions_numpy = actions.detach().cpu().numpy()
            if len(actions_numpy.shape) > 1:
                actions_numpy = actions_numpy[0]
            action_class = Action(command=actions_numpy)
            new_obs, new_rewards, new_dones, new_infos = self.environment.step(action_class)

            # Done is flagged True when the episode has ended AND the observation we see is
            # already the first frame of the next episode

            dones_tensor = self._to_tensor(new_dones.astype(np.float32))
            dones_accumulator.append(dones_tensor)

            self.last_observation = self._dict_to_tensor(new_obs)

            if model.is_recurrent:
                # Zero out state in environments that have finished
                self.hidden_state = self.hidden_state * (1.0 - dones_tensor.unsqueeze(-1))

            rewards_accumulator.append(self._to_tensor(new_rewards.astype(np.float32)))

            episode_information.append(new_infos)

        if model.is_recurrent:
            final_values = model.value(self.last_observation, state=self.hidden_state)
        else:
            final_values = model.value(self.last_observation)

        if len(observation_accumulator['goal']) > 0:
            observations_buffer = {}
            observations_buffer['environment'] = torch.stack(observation_accumulator['environment'])
            observations_buffer['goal'] = torch.stack(observation_accumulator['goal'])
        else:
            observations_buffer = torch.stack(observation_accumulator['environment'])

        rewards_buffer = torch.stack(rewards_accumulator)
        actions_buffer = torch.stack(action_accumulator)  # Actions may have different dtypes
        values_buffer = torch.stack(value_accumulator)
        dones_buffer = torch.stack(dones_accumulator)
        logprobs_buffer = torch.stack(logprobs_accumulator)

        # Generalized Advantage Estimation
        # https://arxiv.org/abs/1506.02438
        advantages = self.discount_bootstrap_gae(
            rewards_buffer, dones_buffer, values_buffer, final_values,
            self.discount_factor, self.gae_lambda
        )

        returns = advantages + values_buffer

        return Trajectories(
            num_steps=advantages.size(0),
            num_envs=advantages.size(1),
            environment_information=episode_information,
            transition_tensors={
                'observations': observations_buffer,
                'estimated_returns': returns,
                'dones': dones_buffer,
                'actions': actions_buffer,
                'estimated_values': values_buffer,
                'estimated_advantages': advantages,
                'action:logprobs': logprobs_buffer,
            },
            rollout_tensors={
                'initial_hidden_state': initial_hidden_state,
                'final_estimated_values': final_values
            }
        )
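
discount_bootstrap_gae is not shown in this example. The sketch below is a self-contained implementation of the standard Generalized Advantage Estimation recursion from the linked paper, matching the call signature used above with tensors shaped (num_steps, num_envs); it is an illustration, not necessarily the project's exact implementation.

import torch


def discount_bootstrap_gae(rewards, dones, values, final_values,
                           discount_factor, gae_lambda):
    """ Standard GAE over (num_steps, num_envs) tensors, bootstrapped with final_values. """
    num_steps = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(final_values)

    for t in reversed(range(num_steps)):
        # Value of the next state; bootstrap with final_values at the last step.
        next_values = final_values if t == num_steps - 1 else values[t + 1]
        # dones[t] == 1 means the next observation starts a new episode, so both the
        # bootstrap term and the running GAE sum are cut at that boundary.
        not_done = 1.0 - dones[t]
        delta = rewards[t] + discount_factor * next_values * not_done - values[t]
        gae = delta + discount_factor * gae_lambda * not_done * gae
        advantages[t] = gae

    return advantages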