def step(self, action_n):
    """Step all environments in parallel with one action per environment.

    Environments that finish (or hit max_path_length) are reset in place,
    so the returned observations are always valid for the next call.
    """
    # Fan the step out to the worker processes; workers that own no
    # environments return None and are filtered out.
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]

    # Workers may return results in arbitrary order, so re-sort by env id.
    items = list(zip(ids, obs, rewards, dones, env_infos))
    items = sorted(items, key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = list(zip(*items))

    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)

    # Force termination for environments that reached the horizon.
    self.ts += 1
    dones[self.ts >= self.max_path_length] = True

    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
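# Usage sketch for the vectorized step() above (illustrative: `make_env`
# and the executor construction are assumptions, not from this source).
# One action per environment goes in; finished environments are reset
# inside step(), so the returned observations stay valid between calls.
import numpy as np

envs = [make_env() for _ in range(8)]  # hypothetical env constructor
vec_env = ParallelVecEnvExecutor(envs=envs, max_path_length=100)  # assumed

obses = vec_env.reset()
for _ in range(100):
    actions = np.stack([env.action_space.sample() for env in envs])
    obses, rewards, dones, env_infos = vec_env.step(actions)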
def obtain_samples(self, itr):
    """Collect samples for the given iteration number.

    :param itr: Iteration number.
    :return: A list of paths.
    """
    paths = []
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    n_samples = 0
    batch_samples = self.vec_env.num_envs * self.algo.max_path_length

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_samples:
        policy.reset(dones)

        if self.algo.input_include_goal:
            obs = [obs["observation"] for obs in obses]
            d_g = [obs["desired_goal"] for obs in obses]
            a_g = [obs["achieved_goal"] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                input_obses, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                input_obses)

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transition(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs["observation"] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs["achieved_goal"] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transition(
                observation=obses,
                action=actions,
                reward=rewards * self.algo.reward_scale,
                terminal=dones,
                next_observation=next_obses,
            )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                )
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)

            if done:
                paths.append(
                    dict(rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["env_infos"])))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses

    return paths
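# The goal-conditioned branch above assumes HER-style dict observations.
# A minimal standalone sketch of that input construction (shapes and
# values here are illustrative, not from this source):
import numpy as np

obses = [
    {'observation': np.zeros(4), 'desired_goal': np.ones(2),
     'achieved_goal': np.zeros(2)},
    {'observation': np.ones(4), 'desired_goal': np.zeros(2),
     'achieved_goal': np.ones(2)},
]

obs = [o['observation'] for o in obses]
d_g = [o['desired_goal'] for o in obses]
# Policy input = observation concatenated with the desired goal:
# shape (n_envs, obs_dim + goal_dim) == (2, 6) here.
input_obses = np.concatenate((obs, d_g), axis=-1)
assert input_obses.shape == (2, 6)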
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Obtain samples."""
    logger.log('Obtaining samples for iteration %d...' % itr)
    if not batch_size:
        batch_size = self.algo.max_path_length * self.n_envs

    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    while n_samples < batch_size:
        t = time.time()
        policy.reset(dones)

        actions, agent_infos = policy.get_actions(obses)

        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()

        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]['observations'].append(observation)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['agent_infos'].append(agent_info)
            if done:
                paths.append(
                    dict(observations=self.env_spec.observation_space.
                         flatten_n(running_paths[idx]['observations']),
                         actions=self.env_spec.action_space.flatten_n(
                             running_paths[idx]['actions']),
                         rewards=tensor_utils.stack_tensor_list(
                             running_paths[idx]['rewards']),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['env_infos']),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['agent_infos'])))
                n_samples += len(running_paths[idx]['rewards'])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    if whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, batch_size)
        return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it defaults to algo.max_path_length * n_envs.
        whole_paths (bool): Whether to return all the paths or not. True
            by default. The paths may have a total sample size larger
            than batch_size; they will be truncated if this flag is
            false.

    Returns:
        list[dict]: Sample paths.

    Note:
        Each path is a dictionary with the following keys and values:

        * observations: numpy.ndarray with shape [Batch, *obs_dims]
        * actions: numpy.ndarray with shape [Batch, *act_dims]
        * rewards: numpy.ndarray with shape [Batch, ]
        * env_infos: A dictionary with each key representing one
          environment info, the value being a numpy.ndarray with shape
          [Batch, ?]. One example is "ale.lives" for atari environments.
        * agent_infos: A dictionary with each key representing one agent
          info, the value being a numpy.ndarray with shape [Batch, ?].
          One example is "prev_action", which is used by recurrent
          policies as the previous action input, merged with the
          observation input as the state input.
        * dones: numpy.ndarray with shape [Batch, ]
    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if not batch_size:
        batch_size = self.algo.max_path_length * self._n_envs

    paths = []
    n_samples = 0
    obses = self._vec_env.reset()
    dones = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs

    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    with click.progressbar(length=batch_size, label='Sampling') as pbar:
        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = \
                self._vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self._vec_env.num_envs)
                ]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(observations=[],
                                              actions=[],
                                              rewards=[],
                                              env_infos=[],
                                              agent_infos=[],
                                              dones=[])
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                running_paths[idx]['dones'].append(done)

                if done:
                    obs = np.asarray(running_paths[idx]['observations'])
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(
                                 running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.
                             stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos']),
                             dones=np.asarray(
                                 running_paths[idx]['dones'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.update(len(obses))
            obses = next_obses

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    return paths if whole_paths else truncate_paths(paths, batch_size)
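# Sketch of consuming the returned paths, e.g. for per-path statistics
# (the sampler/algo wiring is assumed, not shown in this source):
paths = sampler.obtain_samples(itr=0, batch_size=4000)

# Each path holds stacked per-step arrays, so per-path statistics are
# plain reductions over the leading (time) axis.
undiscounted_returns = [path['rewards'].sum() for path in paths]
path_lengths = [len(path['rewards']) for path in paths]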
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it defaults to algo.max_path_length * n_envs.
        whole_paths (bool): Whether to return all the paths or not. True
            by default. The paths may have a total sample size larger
            than batch_size; they will be truncated if this flag is
            false.

    Returns:
        list[dict]: Sample paths, each path with keys

            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if not batch_size:
        batch_size = self.algo.max_path_length * self._n_envs

    paths = []
    n_samples = 0
    obses = self._vec_env.reset()
    dones = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs

    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    while n_samples < batch_size:
        t = time.time()
        policy.reset(dones)

        actions, agent_infos = policy.get_actions(obses)

        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self._vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()

        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]['observations'].append(observation)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['agent_infos'].append(agent_info)
            if done:
                obs = np.asarray(running_paths[idx]['observations'])
                actions = np.asarray(running_paths[idx]['actions'])
                paths.append(
                    dict(observations=obs,
                         actions=actions,
                         rewards=np.asarray(running_paths[idx]['rewards']),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['env_infos']),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['agent_infos'])))
                n_samples += len(running_paths[idx]['rewards'])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size):
    """Collect samples for the given iteration number.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.

    Returns:
        list: A list of paths.
    """
    paths = []
    if not self.no_reset or self._last_obses is None:
        obses = self.vec_env.reset()
    else:
        obses = self._last_obses
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    n_samples = 0

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(dones)

        if self.algo.input_include_goal:
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, input_obses, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                input_obses)

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transitions(
                observation=obses,
                action=actions,
                reward=rewards * self.algo.reward_scale,
                terminal=dones,
                next_observation=next_obses,
            )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now.
                    # Note that running_length is not len(rewards),
                    # because a path may not be complete in one batch.
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                        dones=tensor_utils.stack_tensor_list(
                            running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses
    return paths
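# The _last_* buffers above let a path's statistics survive across
# obtain_samples() calls when no_reset is set. A toy, self-contained
# illustration of that carry-over (names simplified, not from this source):
carried_return = 0.0

def consume_batch(rewards, dones):
    """Accumulate episode return across batches; reset only on done."""
    global carried_return
    for reward, done in zip(rewards, dones):
        carried_return += reward
        if done:
            print('episode return:', carried_return)
            carried_return = 0.0

consume_batch([1.0, 1.0], [False, False])  # episode still running
consume_batch([1.0], [True])               # prints: episode return: 3.0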
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.
        whole_paths(bool): Not effective; kept only to comply with the
            base class.

    Returns:
        list: A list of paths.
    """
    assert batch_size is not None

    paths = []
    if not self._no_reset or self._last_obses is None:
        obses = self._vec_env.reset()
    else:
        obses = self._last_obses
    completes = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs
    n_samples = 0

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(completes)
        obs_space = self.algo.env_spec.observation_space
        input_obses = obs_space.flatten_n(obses)
        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, input_obses, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                input_obses)

        next_obses, rewards, dones, env_infos = \
            self._vec_env.step(actions)
        completes = env_infos['vec_env_executor.complete']
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]

        self.algo.replay_buffer.add_transitions(
            observation=obses,
            action=actions,
            reward=rewards,
            terminal=dones,
            next_observation=next_obses,
        )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now.
                    # Note that running_length is not len(rewards),
                    # because a path may not be complete in one batch.
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses
    return paths
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.
        whole_paths(bool): Not effective; kept only to comply with the
            base class.

    Raises:
        ValueError: If the algorithm doesn't have an exploration_policy
            field.

    Returns:
        list: A list of paths.
    """
    assert batch_size is not None

    paths = []
    if not self._no_reset or self._last_obses is None:
        obses = self._vec_env.reset()
    else:
        obses = self._last_obses
    completes = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs
    n_samples = 0

    policy = self.algo.exploration_policy
    if policy is None:
        raise ValueError('OffPolicyVectorizedSampler should only be used '
                         'with an exploration_policy.')
    while n_samples < batch_size:
        policy.reset(completes)
        obs_space = self.algo.env_spec.observation_space
        input_obses = obs_space.flatten_n(obses)

        actions, agent_infos = policy.get_actions(input_obses)

        next_obses, rewards, dones, env_infos = \
            self._vec_env.step(actions)
        completes = env_infos['vec_env_executor.complete']
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]

        for idx, reward, env_info, done, obs, next_obs, action in zip(
                itertools.count(), rewards, env_infos, dones, obses,
                next_obses, actions):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    observations=[],
                    next_observations=[],
                    actions=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now.
                    # Note that running_length is not len(rewards),
                    # because a path may not be complete in one batch.
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['observations'].append(obs)
            running_paths[idx]['next_observations'].append(next_obs)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))

                # Flatten the transitions into fixed-shape arrays before
                # handing the path to the replay buffer.
                act_space = self._env_spec.action_space
                path_dict = {}
                path_dict['observations'] = obs_space.flatten_n(
                    running_paths[idx]['observations'])
                path_dict['next_observations'] = obs_space.flatten_n(
                    running_paths[idx]['next_observations'])
                path_dict['rewards'] = np.asarray(
                    running_paths[idx]['rewards']).reshape(-1, 1)
                path_dict['terminals'] = np.asarray(
                    running_paths[idx]['dones']).reshape(-1, 1)
                path_dict['actions'] = act_space.flatten_n(
                    running_paths[idx]['actions'])

                self.algo.replay_buffer.add_path(path_dict)
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0
        obses = next_obses
    return paths
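# Shape check for the path_dict layout handed to replay_buffer.add_path()
# above (T, obs_dim and act_dim are illustrative values):
import numpy as np

T, obs_dim, act_dim = 5, 3, 2

path_dict = {
    'observations': np.zeros((T, obs_dim)),
    'next_observations': np.zeros((T, obs_dim)),
    'rewards': np.zeros(T).reshape(-1, 1),            # column vector (T, 1)
    'terminals': np.zeros(T, dtype=bool).reshape(-1, 1),
    'actions': np.zeros((T, act_dim)),
}
# Every entry shares the leading time dimension T, which is what
# add_path-style replay buffers expect.
assert all(v.shape[0] == T for v in path_dict.values())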
def obtain_samples_for_evaluation(self, num_paths=20):
    """Collect rollouts with the current policy for evaluation.

    Args:
        num_paths(int): Number of evaluation rollouts to collect.

    Returns:
        list: A list of paths.
    """
    paths = []
    policy = self.algo.policy

    for i in range(num_paths):
        obses = self.evaluate_env.reset()
        dones = np.asarray([True] * self.evaluate_env.num_envs)
        running_paths = [None] * self.evaluate_env.num_envs
        policy.reset(dones)
        end_of_path = False

        for j in range(500):
            input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self.env_spec, input_obses)
            obses = obs_normalized
            actions = self.algo.policy.get_actions(obs_normalized)
            # get_actions may return (actions, agent_infos); keep only
            # the actions.
            if len(actions) > 1:
                actions = actions[0]
            agent_infos = None
            next_obses, rewards, dones, env_infos = self.evaluate_env.step(
                actions)
            original_next_obses = next_obses
            next_obses = tensor_utils.normalize_pixel_batch(
                self.env_spec, next_obses)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)

            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]
            if env_infos is None:
                env_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]

            for idx, reward, env_info, done in zip(itertools.count(),
                                                   rewards, env_infos,
                                                   dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=0,
                        # running_length: Length of path up to now.
                        # Note that running_length is not len(rewards),
                        # because a path may not be complete in one batch.
                        running_length=0,
                        success_count=0)

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                # End the rollout when the env reports done or the
                # 500-step evaluation horizon is reached.
                if done or j == 499:
                    paths.append(
                        dict(rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                             dones=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             running_length=running_paths[idx]
                             ['running_length'],
                             undiscounted_return=running_paths[idx]
                             ['undiscounted_return'],
                             success_count=running_paths[idx]
                             ['success_count']))
                    running_paths[idx] = None
                    end_of_path = True
            if end_of_path:
                break
            obses = original_next_obses
    return paths
def obtain_samples(self, itr):
    """Obtain samples for the given iteration number."""
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)

        actions, agent_infos = policy.get_actions(obses)

        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()

        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, next_obses,
                env_infos, agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    next_observations=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["next_observations"].append(
                next_observation)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(
                    dict(observations=self.env_spec.observation_space.
                         flatten_n(running_paths[idx]["observations"]),
                         actions=self.env_spec.action_space.flatten_n(
                             running_paths[idx]["actions"]),
                         rewards=tensor_utils.stack_tensor_list(
                             running_paths[idx]["rewards"]),
                         next_observation=tensor_utils.stack_tensor_list(
                             running_paths[idx]["next_observations"]),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["env_infos"]),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["agent_infos"])))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses

    pbar.stop()

    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)

    return paths
def obtain_samples(self, itr, batch_size):
    """Collect samples for the given iteration number.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.

    Returns:
        list: A list of paths.
    """
    paths = []
    if not self.no_reset or self._last_obses is None:
        obses = self.vec_env.reset()
    else:
        obses = self._last_obses
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    n_samples = 0

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(dones)

        if self.algo.input_include_goal:
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses

        obs_normalized = tensor_utils.normalize_pixel_batch(
            self.env_spec, input_obses)

        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, obs_normalized, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                obs_normalized)

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)

        # If the executor reports fresh observations for environments it
        # already reset, keep them for the next iteration.
        new_episode_obs = None
        if 'reset_new_obs' in env_infos:
            new_episode_obs = next_obses.copy()
            for i, reset_new_obs in env_infos['reset_new_obs'][0]:
                new_episode_obs[i] = reset_new_obs
            del env_infos['reset_new_obs']

        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            payload = {
                'observation': obses,
                'action': actions,
                'reward': rewards * self.algo.reward_scale,
                'terminal': dones,
                'next_observation': next_obses,
            }
            if env_infos and env_infos[0].get(
                    'ground_truth_state') is not None:
                payload['ground_truth_state'] = [
                    env_info.get('ground_truth_state')
                    for env_info in env_infos
                ]
            self.algo.replay_buffer.add_transitions(**payload)

        for idx, reward, env_info, q_val, done in zip(
                itertools.count(), rewards, env_infos,
                agent_infos['q_vals'], dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    q_vals=self._last_q_vals[idx].copy(),
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now.
                    # Note that running_length is not len(rewards),
                    # because a path may not be complete in one batch.
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['q_vals'].append(q_val)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_q_vals[idx].append(q_val)
            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        q_vals=np.asarray(running_paths[idx]['q_vals']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_q_vals[idx] = []
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()

        if new_episode_obs is not None:
            obses = new_episode_obs
        else:
            obses = next_obses
        self._last_obses = obses
    return paths
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    If batch_size is not specified, each task collects one episode by
    default, so the batch size will be meta_batch_size * max_path_length.

    When the number of workers is less than the meta batch size, sampling
    is performed in series for each group in self._vec_envs_indices. The
    i-th value of self._vec_envs_indices represents the indices of the
    environments/tasks to be sampled for the i-th iteration.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it defaults to algo.max_path_length * n_envs.
        whole_paths (bool): Whether to return all the paths or not. True
            by default. The paths may have a total sample size larger
            than batch_size; they will be truncated if this flag is
            false.

    Returns:
        list[dict]: Sample paths. Each path carries a batch_idx key
            identifying the environment/task it was sampled from.

    Note:
        Each path is a dictionary with the following keys and values:

        * observations: numpy.ndarray with shape :math:`[N, S^*]`
        * actions: numpy.ndarray with shape :math:`[N, S^*]`
        * rewards: numpy.ndarray with shape :math:`[N, S^*]`
        * dones: numpy.ndarray with shape :math:`[N, S^*]`
        * env_infos: A dictionary with each key representing one
          environment info, the value being a numpy.ndarray with shape
          :math:`[N, S^*]`. One example is "ale.lives" for atari
          environments.
        * agent_infos: A dictionary with each key representing one agent
          info, the value being a numpy.ndarray with shape
          :math:`[N, S^*]`. One example is "prev_action", which is used
          by recurrent policies as the previous action input, merged
          with the observation input as the state input.
    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if batch_size is None:
        batch_size = self.algo.max_path_length * self._meta_batch_size

    paths = []

    tasks = self.env.sample_tasks(self._meta_batch_size)

    # Start main loop
    batch_size_per_loop = batch_size // len(self._vec_envs_indices)
    for vec_envs_indices in self._vec_envs_indices:
        self._setup_worker(vec_envs_indices, tasks)

        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        # Only reset policies at the beginning of a meta batch
        policy.reset(dones)

        while n_samples < batch_size_per_loop:
            t = time.time()

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self._vec_env.step(
                actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self._vec_env.num_envs)
                ]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        dones=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    obs = np.asarray(running_paths[idx]['observations'])
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(
                                 running_paths[idx]['rewards']),
                             dones=np.asarray(running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.
                             stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos']),
                             batch_idx=idx))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses
        pbar.stop()

        tabular.record('PolicyExecTime', policy_time)
        tabular.record('EnvExecTime', env_time)
        tabular.record('ProcessExecTime', process_time)

    return paths if whole_paths else truncate_paths(paths, batch_size)
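# Sketch of consuming the meta-batch output described in the docstring
# (the sampler construction is assumed; `meta_sampler` is hypothetical):
from collections import defaultdict

paths = meta_sampler.obtain_samples(itr=0)

# batch_idx records which vectorized environment produced each path, so
# the flat list can be regrouped per environment/task afterwards.
paths_by_env = defaultdict(list)
for path in paths:
    paths_by_env[path['batch_idx']].append(path)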