def start_worker(self): """Initialize the sampler.""" n_envs = self.n_envs envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)] self.vec_env = VecEnvExecutor( envs=envs, max_path_length=self.algo.max_path_length) self.env_spec = self.env.spec
def start_worker(self):
    n_envs = self.n_envs
    if n_envs is None:
        n_envs = int(self.algo.batch_size / self.algo.max_path_length)
        n_envs = max(1, min(n_envs, 100))
    if getattr(self.algo.env, 'vectorized', False):
        self.vec_env = self.algo.env.vec_env_executor(
            n_envs=n_envs, max_path_length=self.algo.max_path_length)
    else:
        print(self.algo.env)
        if isinstance(self.algo.env.env, EmbeddedPolicyEnv):
            envs = [
                EmbeddedPolicyEnv(
                    pickle.loads(
                        pickle.dumps(self.algo.env.env._wrapped_env)),
                    self.algo.env.env._wrapped_policy)
                for _ in range(n_envs)
            ]
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
        self.vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
    self.env_spec = self.algo.env.spec
def start_worker(self):
    n_envs = self.n_envs
    if getattr(self.algo.env, 'vectorized', False):
        self.vec_env = self.algo.env.vec_env_executor(
            n_envs=n_envs, max_path_length=self.algo.max_path_length)
    else:
        envs = [
            pickle.loads(pickle.dumps(self.algo.env))
            for _ in range(n_envs)
        ]
        self.vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
    self.env_spec = self.algo.env.spec
def start_worker(self): """Initialize the sampler.""" n_envs = self.n_envs if n_envs is None: n_envs = int(self.algo.rollout_batch_size) n_envs = max(1, min(n_envs, 100)) if getattr(self.algo.env, 'vectorized', False): self.vec_env = self.algo.env.vec_env_executor( n_envs=n_envs, max_path_length=self.algo.max_path_length) else: envs = [ pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs) ] self.vec_env = VecEnvExecutor( envs=envs, max_path_length=self.algo.max_path_length) self.env_spec = self.algo.env.spec
class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler."""

    def __init__(self, algo, n_envs=None):
        """
        Construct an OffPolicyVectorizedSampler.

        :param algo: Algorithms.
        :param n_envs: Number of parallelized sampling envs.
        """
        super(OffPolicyVectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    @overrides
    def start_worker(self):
        """Initialize the sampler."""
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.rollout_batch_size)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    @overrides
    def shutdown_worker(self):
        """Terminate workers if necessary."""
        self.vec_env.close()

    @overrides
    def obtain_samples(self, itr):
        """
        Collect samples for the given iteration number.

        :param itr: Iteration number.
        :return: A list of paths.
        """
        paths = []
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        n_samples = 0
        batch_samples = self.vec_env.num_envs * self.algo.max_path_length

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()
        while n_samples < batch_samples:
            policy.reset(dones)
            if self.algo.input_include_goal:
                obs = [obs["observation"] for obs in obses]
                d_g = [obs["desired_goal"] for obs in obses]
                a_g = [obs["achieved_goal"] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    input_obses, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    input_obses)

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transition(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs["observation"] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs["achieved_goal"] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transition(
                    observation=obses,
                    action=actions,
                    reward=rewards * self.algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                    )
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["env_infos"].append(env_info)

                if done:
                    paths.append(
                        dict(rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]["rewards"]),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["env_infos"])))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None

                    if self.algo.es:
                        self.algo.es.reset()
            obses = next_obses

        return paths

    @overrides
    def process_samples(self, itr, paths):
        """
        Return processed sample data based on the collected paths.

        :param itr: Iteration number.
        :param paths: A list of collected paths.
        :return: Processed sample data.
        """
        success_history = []
        for path in paths:
            if "is_success" in path["env_infos"]:
                success = np.array(path["env_infos"]["is_success"])
                success_rate = np.mean(success)
                success_history.append(success_rate)

        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        samples_data = dict(undiscounted_returns=undiscounted_returns,
                            success_history=success_history)
        return samples_data
class OffPolicyVectorizedSampler(BatchSampler):
    """This class implements OffPolicyVectorizedSampler.

    Args:
        algo(garage.np.RLAlgorithm): Algorithm.
        env(garage.envs.GarageEnv): Environment.
        n_envs(int): Number of parallel environments managed by sampler.

    """

    def __init__(self, algo, env, n_envs=None, no_reset=True):
        if n_envs is None:
            n_envs = int(algo.rollout_batch_size)
        super(OffPolicyVectorizedSampler, self).__init__(algo, env, n_envs)
        self.n_envs = n_envs
        self.no_reset = no_reset

        self._last_obses = None
        self._last_uncounted_discount = [0] * n_envs
        self._last_running_length = [0] * n_envs
        self._last_success_count = [0] * n_envs

    @overrides
    def start_worker(self):
        """Initialize the sampler."""
        n_envs = self.n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]
        self.vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.env.spec

    @overrides
    def shutdown_worker(self):
        """Terminate workers if necessary."""
        self.vec_env.close()

    @overrides
    def obtain_samples(self, itr, batch_size):
        """Collect samples for the given iteration number.

        Args:
            itr(int): Iteration number.
            batch_size(int): Number of environment interactions in one batch.

        Returns:
            list: A list of paths.

        """
        paths = []
        if not self.no_reset or self._last_obses is None:
            obses = self.vec_env.reset()
        else:
            obses = self._last_obses
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs
        n_samples = 0

        policy = self.algo.policy
        if self.algo.es:
            self.algo.es.reset()
        while n_samples < batch_size:
            policy.reset(dones)
            if self.algo.input_include_goal:
                obs = [obs['observation'] for obs in obses]
                d_g = [obs['desired_goal'] for obs in obses]
                a_g = [obs['achieved_goal'] for obs in obses]
                input_obses = np.concatenate((obs, d_g), axis=-1)
            else:
                input_obses = obses
            if self.algo.es:
                actions, agent_infos = self.algo.es.get_actions(
                    itr, input_obses, self.algo.policy)
            else:
                actions, agent_infos = self.algo.policy.get_actions(
                    input_obses)

            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            self._last_obses = next_obses
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            n_samples += len(next_obses)

            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]

            if self.algo.input_include_goal:
                self.algo.replay_buffer.add_transitions(
                    observation=obs,
                    action=actions,
                    goal=d_g,
                    achieved_goal=a_g,
                    terminal=dones,
                    next_observation=[
                        next_obs['observation'] for next_obs in next_obses
                    ],
                    next_achieved_goal=[
                        next_obs['achieved_goal'] for next_obs in next_obses
                    ],
                )
            else:
                self.algo.replay_buffer.add_transitions(
                    observation=obses,
                    action=actions,
                    reward=rewards * self.algo.reward_scale,
                    terminal=dones,
                    next_observation=next_obses,
                )

            for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                                   env_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=self._last_uncounted_discount[idx],
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # because a path may not be complete in one batch
                        running_length=self._last_running_length[idx],
                        success_count=self._last_success_count[idx])

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                self._last_uncounted_discount[idx] += reward
                self._last_success_count[idx] += env_info.get(
                    'is_success') or 0
                self._last_running_length[idx] += 1

                if done or n_samples >= batch_size:
                    paths.append(
                        dict(
                            rewards=tensor_utils.stack_tensor_list(
                                running_paths[idx]['rewards']),
                            dones=tensor_utils.stack_tensor_list(
                                running_paths[idx]['dones']),
                            env_infos=tensor_utils.stack_tensor_dict_list(
                                running_paths[idx]['env_infos']),
                            running_length=running_paths[idx]
                            ['running_length'],
                            undiscounted_return=running_paths[idx]
                            ['undiscounted_return'],
                            success_count=running_paths[idx]['success_count']))
                    running_paths[idx] = None

                    if done:
                        self._last_running_length[idx] = 0
                        self._last_success_count[idx] = 0
                        self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
            obses = next_obses
        return paths

    @overrides
    def process_samples(self, itr, paths):
        """Return processed sample data based on the collected paths.

        Args:
            itr(int): Iteration number.
            paths(list): A list of collected paths.

        Returns:
            dict: Processed sample data.

        """
        success_history = [
            path['success_count'] / path['running_length'] for path in paths
        ]
        undiscounted_returns = [path['undiscounted_return'] for path in paths]
        samples_data = dict(
            undiscounted_returns=undiscounted_returns,
            success_history=success_history)
        return samples_data
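# --- Worked example (toy numbers, not from the class above) of the
# aggregation process_samples performs: the per-path success rate is
# success_count / running_length, and the undiscounted return is read
# straight from the path dictionary that obtain_samples built.
toy_paths = [
    dict(success_count=3, running_length=10, undiscounted_return=5.0),
    dict(success_count=0, running_length=8, undiscounted_return=-1.5),
]
success_history = [
    path['success_count'] / path['running_length'] for path in toy_paths
]
undiscounted_returns = [path['undiscounted_return'] for path in toy_paths]
print(success_history)        # -> [0.3, 0.0]
print(undiscounted_returns)   # -> [5.0, -1.5]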
class OnPolicyVectorizedSampler(BatchSampler):
    def __init__(self, algo, env, n_envs=1):
        super(OnPolicyVectorizedSampler, self).__init__(algo, env, n_envs)
        self.n_envs = n_envs

    @overrides
    def start_worker(self):
        n_envs = self.n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]
        self.vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.env.spec

    @overrides
    def shutdown_worker(self):
        self.vec_env.close()

    @overrides
    def obtain_samples(self, itr, batch_size=None, whole_paths=True):
        logger.log('Obtaining samples for iteration %d...' % itr)

        if not batch_size:
            batch_size = self.algo.max_path_length * self.n_envs

        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]['observations']),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]['actions']),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

        if whole_paths:
            return paths
        else:
            paths_truncated = truncate_paths(paths, batch_size)
            return paths_truncated
class OnPolicyVectorizedSampler(BatchSampler):
    def __init__(self, algo, n_envs=None):
        super(OnPolicyVectorizedSampler, self).__init__(algo)
        self.n_envs = n_envs

    @overrides
    def start_worker(self):
        n_envs = self.n_envs
        if n_envs is None:
            n_envs = int(self.algo.batch_size / self.algo.max_path_length)
            n_envs = max(1, min(n_envs, 100))
        if getattr(self.algo.env, 'vectorized', False):
            self.vec_env = self.algo.env.vec_env_executor(
                n_envs=n_envs, max_path_length=self.algo.max_path_length)
        else:
            envs = [
                pickle.loads(pickle.dumps(self.algo.env))
                for _ in range(n_envs)
            ]
            self.vec_env = VecEnvExecutor(
                envs=envs, max_path_length=self.algo.max_path_length)
        self.env_spec = self.algo.env.spec

    @overrides
    def shutdown_worker(self):
        self.vec_env.close()

    @overrides
    def obtain_samples(self, itr):
        logger.log("Obtaining samples for iteration %d..." % itr)
        paths = []
        n_samples = 0
        obses = self.vec_env.reset()
        dones = np.asarray([True] * self.vec_env.num_envs)
        running_paths = [None] * self.vec_env.num_envs

        pbar = ProgBarCounter(self.algo.batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy

        import time
        while n_samples < self.algo.batch_size:
            t = time.time()
            policy.reset(dones)
            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self.vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
            for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, next_obses,
                    env_infos, agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        next_observations=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]["observations"].append(observation)
                running_paths[idx]["actions"].append(action)
                running_paths[idx]["rewards"].append(reward)
                running_paths[idx]["next_observations"].append(
                    next_observation)
                running_paths[idx]["env_infos"].append(env_info)
                running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    paths.append(
                        dict(observations=self.env_spec.observation_space.
                             flatten_n(running_paths[idx]["observations"]),
                             actions=self.env_spec.action_space.flatten_n(
                                 running_paths[idx]["actions"]),
                             rewards=tensor_utils.stack_tensor_list(
                                 running_paths[idx]["rewards"]),
                             next_observation=tensor_utils.stack_tensor_list(
                                 running_paths[idx]["next_observations"]),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["env_infos"]),
                             agent_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]["agent_infos"])))
                    n_samples += len(running_paths[idx]["rewards"])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

        logger.record_tabular("PolicyExecTime", policy_time)
        logger.record_tabular("EnvExecTime", env_time)
        logger.record_tabular("ProcessExecTime", process_time)

        return paths
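# --- Hedged illustration (not garage's implementation) of the two
# tensor_utils conversions the samplers above rely on:
# split_tensor_dict_list turns one dict of batched arrays (leading dim =
# num_envs) into a per-env list of dicts, and stack_tensor_dict_list goes
# the other way when a finished path's env_infos/agent_infos are stacked.
# The helpers below only sketch that behavior for flat dicts.
import numpy as np


def split_dict_list(batched):
    """Split {'k': array of shape (n, ...)} into n dicts of single entries."""
    n = len(next(iter(batched.values())))
    return [{key: value[i] for key, value in batched.items()} for i in range(n)]


def stack_dict_list(dict_list):
    """Stack a list of per-step dicts back into one dict of arrays."""
    return {
        key: np.asarray([d[key] for d in dict_list])
        for key in dict_list[0]
    }


agent_infos = {'mean': np.zeros((3, 2)), 'log_std': np.ones((3, 2))}
per_env = split_dict_list(agent_infos)   # list of 3 dicts, one per env
restacked = stack_dict_list(per_env)     # back to arrays with a batch dim
print(per_env[0]['mean'].shape, restacked['log_std'].shape)  # -> (2,) (3, 2)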