# Imports assumed by the samplers below (the tensor_utils path follows
# garage's layout; adjust it to match your fork):
import itertools
import time

import numpy as np

from garage.misc import tensor_utils


def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False,
            deterministic=False):
    """Generate a sample from a policy.

    Args:
        deterministic (bool): Boolean variable indicating whether a
            stochastic or deterministic action should be taken during the
            rollout. This is False (stochastic actions) by default.

    Returns:
        dict: Path with stacked 'observations', 'actions', 'rewards',
            'agent_infos' and 'env_infos', or None when animated is True
            and always_return_paths is False.

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    if animated and not always_return_paths:
        return None

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )

def rollout(env,
            agent,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            always_return_paths=False):
    """Generate a fixed-length sample from a policy.

    Variant of the rollout above: the commented-out blocks below disable
    deterministic action selection and early termination on `done`, so the
    path always runs for max_path_length steps.
    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # a = agent_info["mean"]
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        # if d:
        #     break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    if animated and not always_return_paths:
        return None

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )

def _process_trajectory(self, result):
    """Collect a trajectory from the ray object store.

    Converts that trajectory to a garage-friendly format.

    Args:
        result: Ray object id of the ready-to-be-collected trajectory.

    Returns:
        tuple: The processed trajectory dict and the number of samples
            it contains.
    """
    trajectory = ray.get(result)
    ready_worker_id = trajectory[0]
    self._active_worker_ids.remove(ready_worker_id)
    self._idle_worker_ids.append(ready_worker_id)
    trajectory = dict(
        observations=np.asarray(trajectory[1]),
        actions=np.asarray(trajectory[2]),
        rewards=tensor_utils.stack_tensor_list(trajectory[3]),
        agent_infos=trajectory[4],
        env_infos=trajectory[5])
    num_returned_samples = len(trajectory['observations'])
    return trajectory, num_returned_samples

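# Illustration (hypothetical, not part of the original sampler): the
# indices trajectory[0]..trajectory[5] above imply that each ray task
# returns (worker_id, observations, actions, rewards, agent_infos,
# env_infos). A minimal worker task compatible with that layout:
import ray


@ray.remote
def sample_one_trajectory(worker_id, env, agent, max_path_length):
    """Run one rollout and return it in the layout _process_trajectory expects."""
    path = rollout(env, agent, max_path_length=max_path_length)
    return (worker_id, path['observations'], path['actions'],
            path['rewards'], path['agent_infos'], path['env_infos'])
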
def obtain_samples(self, itr):
    """Collect samples for the given iteration number.

    :param itr: Iteration number.
    :return: A list of paths.
    """
    paths = []
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    n_samples = 0
    batch_samples = self.vec_env.num_envs * self.algo.max_path_length

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()
    while n_samples < batch_samples:
        policy.reset(dones)

        if self.algo.input_include_goal:
            obs = [obs["observation"] for obs in obses]
            d_g = [obs["desired_goal"] for obs in obses]
            a_g = [obs["achieved_goal"] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                input_obses, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                input_obses)

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transition(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs["observation"] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs["achieved_goal"] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transition(
                observation=obses,
                action=actions,
                reward=rewards * self.algo.reward_scale,
                terminal=dones,
                next_observation=next_obses,
            )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                )
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)

            if done:
                paths.append(
                    dict(rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["env_infos"])))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None

                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses

    return paths

def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Obtain samples.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of transitions to sample. Defaults to
            max_path_length * n_envs.
        whole_paths (bool): If False, truncate the returned paths so
            they contain exactly batch_size transitions in total.

    Returns:
        list: A list of paths.
    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if not batch_size:
        batch_size = self.algo.max_path_length * self.n_envs

    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    while n_samples < batch_size:
        t = time.time()
        policy.reset(dones)

        actions, agent_infos = policy.get_actions(obses)

        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()

        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]['observations'].append(observation)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['agent_infos'].append(agent_info)
            if done:
                paths.append(
                    dict(observations=self.env_spec.observation_space.
                         flatten_n(running_paths[idx]['observations']),
                         actions=self.env_spec.action_space.flatten_n(
                             running_paths[idx]['actions']),
                         rewards=tensor_utils.stack_tensor_list(
                             running_paths[idx]['rewards']),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['env_infos']),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['agent_infos'])))
                n_samples += len(running_paths[idx]['rewards'])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    if whole_paths:
        return paths
    return truncate_paths(paths, batch_size)

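# Note (assumption): `truncate_paths` is expected to trim the collected
# paths so the total transition count equals batch_size. The hypothetical
# sketch below only illustrates those semantics; it is not garage's
# implementation (which also slices the agent_infos/env_infos dicts):
def _truncate_paths_sketch(paths, n_samples):
    """Keep whole paths until n_samples is reached, then cut the last one."""
    truncated, total = [], 0
    for path in paths:
        remaining = n_samples - total
        if remaining <= 0:
            break
        length = len(path['rewards'])
        if length <= remaining:
            truncated.append(path)
            total += length
        else:
            short = dict(path)
            for key in ('observations', 'actions', 'rewards'):
                short[key] = short[key][:remaining]
            truncated.append(short)
            total += remaining
    return truncated
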
def obtain_samples(self, itr, batch_size):
    """Collect samples for the given iteration number.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.

    Returns:
        list: A list of paths.
    """
    paths = []
    if not self.no_reset or self._last_obses is None:
        obses = self.vec_env.reset()
    else:
        obses = self._last_obses
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    n_samples = 0

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(dones)
        if self.algo.input_include_goal:
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses

        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, input_obses, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                input_obses)

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transitions(
                observation=obses,
                action=actions,
                reward=rewards * self.algo.reward_scale,
                terminal=dones,
                next_observation=next_obses,
            )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now
                    # Note that running_length is not len(rewards)
                    # because a path may not be complete in one batch
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                        dones=tensor_utils.stack_tensor_list(
                            running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses
    return paths

def rollout(env,
            agent,
            *,
            max_path_length=np.inf,
            animated=False,
            speedup=1,
            deterministic=False):
    """Sample a single rollout of the agent in the environment.

    Args:
        agent(Policy): Agent used to select actions.
        env(gym.Env): Environment to perform actions in.
        max_path_length(int): If the rollout reaches this many timesteps,
            it is terminated.
        animated(bool): If true, render the environment after each step.
        speedup(float): Factor by which to decrease the wait time between
            rendered steps. Only relevant if animated == True.
        deterministic(bool): If true, use the mean action returned by the
            stochastic policy instead of sampling from the returned action
            distribution.

    Returns:
        dict: Contains the following keys:
            observations(np.array): Array of flattened observations.
            actions(np.array): Array of flattened actions.
            rewards(np.array): Array of rewards of shape (timesteps,).
            agent_infos(dict[str, np.array]): Dictionary of stacked,
                non-flattened `agent_info`s.
            env_infos(dict[str, np.array]): Dictionary of stacked,
                non-flattened `env_info`s.

    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        if deterministic:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    return dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )

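# Usage sketch (illustrative scaffolding, not part of the original module):
# `rollout` needs garage/akro-style spaces that provide flatten(), plus the
# classic (obs, reward, done, info) step() API. A minimal fake environment
# and random agent are enough to exercise it end to end:
class _FlatBox:
    """Stand-in for an akro.Box; only flatten() and sample() are used."""

    def __init__(self, dim):
        self.dim = dim

    def flatten(self, x):
        return np.asarray(x).ravel()

    def sample(self):
        return np.random.uniform(-1.0, 1.0, self.dim)


class _FakeEnv:
    """Trivial environment; never terminates, so paths hit max_path_length."""

    observation_space = _FlatBox(3)
    action_space = _FlatBox(2)

    def reset(self):
        return np.zeros(3)

    def step(self, action):
        return np.zeros(3), 0.0, False, dict()


class _RandomAgent:
    """Minimal agent: reset() plus get_action(obs) -> (action, agent_info)."""

    def reset(self):
        pass

    def get_action(self, obs):
        # A stochastic garage policy would put distribution parameters
        # (e.g. 'mean') into agent_info; an empty dict suffices here.
        return _FakeEnv.action_space.sample(), dict()


path = rollout(_FakeEnv(), _RandomAgent(), max_path_length=5)
# path['observations'].shape == (5, 3); path['rewards'].shape == (5,)
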
def obtain_samples_for_evaluation(self, num_paths=20):
    """Collect evaluation paths with the current policy.

    Args:
        num_paths(int): Number of evaluation paths to collect.

    Returns:
        list: A list of paths.
    """
    paths = []
    policy = self.algo.policy

    for i in range(num_paths):
        obses = self.evaluate_env.reset()
        dones = np.asarray([True] * self.evaluate_env.num_envs)
        running_paths = [None] * self.evaluate_env.num_envs
        policy.reset(dones)
        end_of_path = False

        # Each evaluation rollout is capped at 500 steps.
        for j in range(500):
            input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self.env_spec, input_obses)
            obses = obs_normalized
            actions = self.algo.policy.get_actions(obs_normalized)
            # get_actions may return (actions, agent_infos); keep only
            # the actions.
            if len(actions) > 1:
                actions = actions[0]
            agent_infos = None

            next_obses, rewards, dones, env_infos = self.evaluate_env.step(
                actions)
            original_next_obses = next_obses
            next_obses = tensor_utils.normalize_pixel_batch(
                self.env_spec, next_obses)

            env_infos = tensor_utils.split_tensor_dict_list(env_infos)

            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]
            if env_infos is None:
                env_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]

            for idx, reward, env_info, done in zip(itertools.count(),
                                                   rewards, env_infos,
                                                   dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=0,
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # because a path may not be complete in one batch
                        running_length=0,
                        success_count=0)

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                if done or j == 499:
                    paths.append(
                        dict(rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                             dones=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             running_length=running_paths[idx]
                             ['running_length'],
                             undiscounted_return=running_paths[idx]
                             ['undiscounted_return'],
                             success_count=running_paths[idx]
                             ['success_count']))
                    running_paths[idx] = None
                    end_of_path = True
            if end_of_path:
                break
            obses = original_next_obses

    return paths

def obtain_samples(self, itr):
    """Collect samples for the given iteration number.

    Args:
        itr(int): Iteration number.

    Returns:
        list: A list of paths, each including next_observations.
    """
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)

        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()

        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, next_obses,
                env_infos, agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    next_observations=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["next_observations"].append(
                next_observation)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(
                    dict(observations=self.env_spec.observation_space.
                         flatten_n(running_paths[idx]["observations"]),
                         actions=self.env_spec.action_space.flatten_n(
                             running_paths[idx]["actions"]),
                         rewards=tensor_utils.stack_tensor_list(
                             running_paths[idx]["rewards"]),
                         next_observation=tensor_utils.stack_tensor_list(
                             running_paths[idx]["next_observations"]),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["env_infos"]),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["agent_infos"])))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)

    return paths

def skill_rollout(env,
                  agent,
                  max_path_length=np.inf,
                  skill_stopping_func=None,
                  reset_start_rollout=True,
                  keep_rendered_rgbs=False,
                  animated=False,
                  speedup=1):
    """Perform one rollout in the given environment.

    Code adapted from https://github.com/florensacc/snn4hrl

    :param env: AsaEnv environment to run in
    :param agent: Policy to sample actions from
    :param max_path_length: force terminate the rollout after this many steps
    :param skill_stopping_func: function ({actions, observations} -> bool)
        that indicates that skill execution is done
    :param reset_start_rollout: whether to reset the env when calling this
        function
    :param keep_rendered_rgbs: whether to keep a list of all rgb_arrays
        (for future video making)
    :param animated: whether to render env after each step
    :param speedup: speedup factor for animation
    """
    observations = []
    actions = []
    rewards = []
    agent_infos = []
    env_infos = []
    terminated = []
    rendered_rgbs = []

    if reset_start_rollout:
        o = env.reset()
    else:
        o = AsaEnv.get_current_obs_wrapped(env)
    agent.reset()
    path_length = 0

    if animated:
        env.render()
    if keep_rendered_rgbs:
        # will return a new entry to the path dict with all rendered images
        rendered_rgbs.append(env.render(mode='rgb_array'))

    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        next_o, r, d, env_info = env.step(a)
        observations.append(env.observation_space.flatten(o))
        rewards.append(r)
        actions.append(env.action_space.flatten(a))
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1

        # natural termination
        if d:
            terminated.append(1)
            break
        terminated.append(0)

        # skill decides to terminate
        path_dict = dict(
            observations=tensor_utils.stack_tensor_list(observations),
            actions=tensor_utils.stack_tensor_list(actions),
            rewards=tensor_utils.stack_tensor_list(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            # here it concatenates all lower-level paths!
        )
        if skill_stopping_func and skill_stopping_func(path_dict):
            break

        o = next_o
        if keep_rendered_rgbs:
            # will return a new entry to the path dict with all rendered images
            rendered_rgbs.append(env.render(mode='rgb_array'))
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    # This is off, as in the case of being an inner rollout it would close
    # the outer renderer!
    # if animated:
    #     env.render(close=True)

    path_dict = dict(
        observations=tensor_utils.stack_tensor_list(observations),
        actions=tensor_utils.stack_tensor_list(actions),
        rewards=tensor_utils.stack_tensor_list(rewards),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
        # here it concatenates all lower-level paths!
        # terminated indicates whether the rollout ended via `done` or by
        # simply reaching the step limit: important when BOTH happen at the
        # same time, to still know it was done (for hierarchized envs)
        terminated=tensor_utils.stack_tensor_list(terminated),
    )
    if keep_rendered_rgbs:
        path_dict['rendered_rgbs'] = tensor_utils.stack_tensor_list(
            rendered_rgbs)
    return path_dict
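

# Example (hypothetical): a skill_stopping_func receives the partial path
# dict built inside the loop above and returns True once the skill should
# end, e.g. after the agent has drifted far enough from the skill's start:
def make_displacement_stopper(threshold=1.0):
    def stop(path):
        obs = path['observations']
        return len(obs) > 1 and np.linalg.norm(obs[-1] - obs[0]) > threshold
    return stop


# skill_rollout(env, skill_policy, max_path_length=50,
#               skill_stopping_func=make_displacement_stopper(0.5))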