def sample(self):
    v, w = SPACE_LOCAL_RANDOM_STATE.uniform(
        low=self.low,
        high=self.high + (0 if np.dtype(self.dtype).kind == 'f' else 1),
        size=self.low.shape).astype(self.dtype)
    return Action(command=np.array([v, w]))
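The `high + 1` adjustment above exists because `np.random.uniform` draws from the half-open interval `[low, high)`. A minimal standalone sketch (not from the repository) showing why the shift is needed for integer dtypes:

import numpy as np

# Standalone illustration (assumption: 2-element integer bounds, mirroring the space above).
# np.random.uniform samples from [low, high), so casting to an integer dtype would never
# yield `high` itself unless the upper bound is shifted by one.
rng = np.random.RandomState(0)
low = np.array([0, 0])
high = np.array([3, 3])

samples = rng.uniform(low=low, high=high + 1, size=low.shape).astype(np.int64)
assert np.all((samples >= low) & (samples <= high))  # `high` is now reachable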
def record_take(model, env_instance, device, debug=False):
    """ Run one rollout of the RL model with the environment, until done is True.
    :param model: RL policy model
    :param env_instance: an instance of the environment to be evaluated
    :param device: cpu or gpu
    :param debug: debug mode has gui output
    :return: some basic metric info of this rollout
    """
    frames = []
    steps = 0
    rewards = 0

    observation = env_instance.reset()

    print("Evaluating environment...")

    while True:
        observation_tensor = _dict_to_tensor(observation, device)

        if isinstance(model, PolicyGradientModel):
            actions = model.step(
                observation_tensor, argmax_sampling=False)['actions'].to(device)[0]
        elif isinstance(model, DeterministicPolicyModel):
            actions = model.step(observation_tensor)['actions'].to(device)[0]
        else:
            raise NotImplementedError

        action_class = Action(command=actions.cpu().numpy())
        observation, reward, done, epinfo = env_instance.step(action_class)

        steps += 1
        rewards += reward

        if debug or device.type == 'cpu':
            frames.append(env_instance.render(mode='human'))

        if done:
            print("episode reward: {}, steps: {}".format(rewards, steps))
            return {'r': rewards, 'l': steps, 'frames': frames}
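A hypothetical way to call `record_take`; `load_trained_policy` and `make_planning_env` below are placeholders for whatever model and environment factories the surrounding project actually provides:

import torch

# Hypothetical usage sketch -- `load_trained_policy` and `make_planning_env`
# are placeholders, not functions from the repository.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = load_trained_policy('checkpoint.pt')      # placeholder helper
env_instance = make_planning_env(seed=0)          # placeholder helper

stats = record_take(model, env_instance, device, debug=True)
print("reward: {}, steps: {}, frames recorded: {}".format(
    stats['r'], stats['l'], len(stats['frames'])))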
def step(self, action):
    """ Take a step in the environment.
    :param action: A numpy array, the same size as the action dim
    :return: A tuple of (obs, r, done, info) returned by the environment
    """
    obs, r, done, info = self.env.step(Action(np.array(action)))
    return obs, r, done, info
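Assumed usage of this wrapper (the `wrapped_env` name is illustrative): once applied, callers can pass a raw numpy command instead of constructing an Action themselves.

# Assumed usage sketch; `wrapped_env` stands for an environment wrapped with the class above.
obs, reward, done, info = wrapped_env.step(np.array([0.3, 0.0]))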
def before_env_step(self):
    """ Things you need to do before the environment step:
    pass inputs to the agent, get back actions and meta-actions. """
    # Render the picture & wait for the user key input on it
    key = self._display(self._img)
    v, w = self._interpret(key)  # interpret the key as action

    action = np.zeros(2, dtype=np.float32)

    if v is not None:
        action[0] = 0.5 * v

    if w is not None:
        coeff = np.pi / 2.0 * 0.1
        self._front_wheel_steering_rotation_state = np.clip(
            self._front_wheel_steering_rotation_state + coeff * w,
            -np.pi / 2.0, np.pi / 2.0)
        action[1] = self._front_wheel_steering_rotation_state

    print(80 * '=')
    print(action)

    self._action = Action(command=action)
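A hypothetical sketch of the kind of key-to-command mapping `self._interpret` might implement; the actual key bindings in the repository may differ:

# Hypothetical key-to-command mapping; the real `_interpret` may use different keys.
def _interpret_keypress(key):
    """ Map a keypress to (v, w): a velocity sign and a steering-increment sign.
    Returns (None, None) for unrecognized keys, matching how the caller above
    treats missing components. """
    bindings = {
        ord('w'): (1.0, None),    # drive forward
        ord('s'): (-1.0, None),   # drive backward
        ord('a'): (None, 1.0),    # steer left
        ord('d'): (None, -1.0),   # steer right
    }
    return bindings.get(key, (None, None))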
""" Run a random mini environment with egocentric costmap observation wrapper. """ from __future__ import print_function from __future__ import absolute_import from __future__ import division import numpy as np from bc_gym_planning_env.envs.base.action import Action from bc_gym_planning_env.envs.mini_env import RandomMiniEnv from bc_gym_planning_env.envs.egocentric import EgocentricCostmap if __name__ == '__main__': for seed in range(1000): print(seed) env = RandomMiniEnv() env = EgocentricCostmap(env) env.seed(seed) env.reset() env.render() done = False while not done: action = Action(command=np.array([0.3, 0.0])) _, _, done, _ = env.step(action) env.render()
def rollout(self, batch_info, model):
    """ Calculate env rollout """
    observation_accumulator = {}  # Device tensors
    observation_accumulator['environment'] = []
    observation_accumulator['goal'] = []
    action_accumulator = []  # Device tensors
    value_accumulator = []  # Device tensors
    dones_accumulator = []  # Device tensors
    rewards_accumulator = []  # Device tensors
    episode_information = []  # Python objects
    logprobs_accumulator = []  # Device tensors

    if self.hidden_state is None and model.is_recurrent:
        self.hidden_state = torch.zeros(
            (self.last_observation.size(0), model.state_dim),
            device=self.device,
            dtype=torch.float32
        )

    # Remember rollout initial state, we'll use that for learning as well
    initial_hidden_state = self.hidden_state

    for step_idx in range(self.number_of_steps):
        if model.is_recurrent:
            step = model.step(self.last_observation, state=self.hidden_state)
            self.hidden_state = step['state']
        else:
            step = model.step(self.last_observation)

        actions, values, logprobs = step['actions'], step['values'], step['logprobs']

        if isinstance(self.last_observation, dict):
            observation_accumulator['environment'].append(self.last_observation['environment'])
            observation_accumulator['goal'].append(self.last_observation['goal'])
        else:
            observation_accumulator['environment'].append(self.last_observation)

        action_accumulator.append(actions)
        value_accumulator.append(values)
        logprobs_accumulator.append(logprobs)

        actions_numpy = actions.detach().cpu().numpy()
        if len(actions_numpy.shape) > 1:
            actions_numpy = actions_numpy[0]

        action_class = Action(command=actions_numpy)
        new_obs, new_rewards, new_dones, new_infos = self.environment.step(action_class)

        # Done is flagged true when the episode has ended AND the frame we see is
        # already a first frame from the next episode
        dones_tensor = self._to_tensor(new_dones.astype(np.float32))
        dones_accumulator.append(dones_tensor)

        self.last_observation = self._dict_to_tensor(new_obs)

        if model.is_recurrent:
            # Zero out state in environments that have finished
            self.hidden_state = self.hidden_state * (1.0 - dones_tensor.unsqueeze(-1))

        rewards_accumulator.append(self._to_tensor(new_rewards.astype(np.float32)))
        episode_information.append(new_infos)

    if model.is_recurrent:
        final_values = model.value(self.last_observation, state=self.hidden_state)
    else:
        final_values = model.value(self.last_observation)

    if len(observation_accumulator['goal']) > 0:
        observations_buffer = {}
        observations_buffer['environment'] = torch.stack(observation_accumulator['environment'])
        observations_buffer['goal'] = torch.stack(observation_accumulator['goal'])
    else:
        observations_buffer = torch.stack(observation_accumulator['environment'])

    rewards_buffer = torch.stack(rewards_accumulator)
    actions_buffer = torch.stack(action_accumulator)  # Actions may have various different dtypes
    values_buffer = torch.stack(value_accumulator)
    dones_buffer = torch.stack(dones_accumulator)
    logprobs_buffer = torch.stack(logprobs_accumulator)

    # Generalized Advantage Estimation
    # https://arxiv.org/abs/1506.02438
    advantages = self.discount_bootstrap_gae(
        rewards_buffer, dones_buffer, values_buffer, final_values,
        self.discount_factor, self.gae_lambda
    )

    returns = advantages + values_buffer

    return Trajectories(
        num_steps=advantages.size(0),
        num_envs=advantages.size(1),
        environment_information=episode_information,
        transition_tensors={
            'observations': observations_buffer,
            'estimated_returns': returns,
            'dones': dones_buffer,
            'actions': actions_buffer,
            'estimated_values': values_buffer,
            'estimated_advantages': advantages,
            'action:logprobs': logprobs_buffer,
        },
        rollout_tensors={
            'initial_hidden_state': initial_hidden_state,
            'final_estimated_values': final_values
        }
    )
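The advantages above come from `self.discount_bootstrap_gae`. Below is a reference sketch of the standard GAE recursion it presumably implements (arXiv:1506.02438), assuming `(num_steps, num_envs)` buffers; it is not necessarily identical to the project's own implementation.

import torch


def discount_bootstrap_gae_sketch(rewards, dones, values, final_values,
                                  discount_factor, gae_lambda):
    """ Reference sketch of Generalized Advantage Estimation (arXiv:1506.02438).

    Assumes `rewards`, `dones`, `values` have shape (num_steps, num_envs) and
    `final_values` has shape (num_envs,). Illustrative only; the project's
    `discount_bootstrap_gae` may differ in details.
    """
    num_steps = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(final_values)

    for t in reversed(range(num_steps)):
        # Bootstrap from the value of the next observation; on the last step that
        # is the value of the observation left over after the rollout finished.
        next_values = final_values if t == num_steps - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]

        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        delta = rewards[t] + discount_factor * next_values * nonterminal - values[t]

        # GAE recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        gae = delta + discount_factor * gae_lambda * nonterminal * gae
        advantages[t] = gae

    return advantages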