def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Collect samples for the given iteration number. Args: itr (int): Number of iteration. batch_size (int): Number of environment steps in one batch. whole_paths (bool): Whether to use whole path or truncated. Returns: list[dict]: A list of paths. """ if not batch_size: batch_size = self.algo.max_path_length * self.n_envs cur_policy_params = self.algo.policy.get_param_values() paths = parallel_sampler.sample_paths( policy_params=cur_policy_params, max_samples=batch_size, max_path_length=self.algo.max_path_length, scope=self.algo.scope, ) if whole_paths: return paths else: paths_truncated = truncate_paths(paths, batch_size) return paths_truncated
def _obtain_is_samples(self, _itr, batch_size=None, whole_paths=True):
    """Collect IS samples for the given iteration number.

    Args:
        _itr (int): Number of iteration.
        batch_size (int): Number of samples in one batch.
        whole_paths (bool): Whether to use whole path or truncated.

    Returns:
        list: A list of paths.

    """
    if batch_size is None:
        batch_size = self.algo.max_path_length

    paths = []
    for hist_policy_distribution, hist_paths in self.get_history_list(
            self.n_backtrack):
        h_paths = self._sample_isweighted_paths(
            policy=self.algo.policy,
            hist_policy_distribution=hist_policy_distribution,
            max_samples=batch_size,
            paths=hist_paths,
            hist_variance_penalty=self.hist_variance_penalty,
            max_is_ratio=self.max_is_ratio,
            ess_threshold=self.ess_threshold,
        )
        paths.extend(h_paths)

    if len(paths) > batch_size:
        paths = random.sample(paths, batch_size)

    return paths if whole_paths else truncate_paths(paths, batch_size)
def test_truncate_paths(self):
    paths = [
        dict(
            observations=np.zeros((100, 1)),
            actions=np.zeros((100, 1)),
            rewards=np.zeros(100),
            env_infos=dict(),
            agent_infos=dict(lala=np.zeros(100)),
        ),
        dict(
            observations=np.zeros((50, 1)),
            actions=np.zeros((50, 1)),
            rewards=np.zeros(50),
            env_infos=dict(),
            agent_infos=dict(lala=np.zeros(50)),
        ),
    ]

    truncated = truncate_paths(paths, 130)
    assert len(truncated) == 2
    assert len(truncated[-1]['observations']) == 30
    assert len(truncated[0]['observations']) == 100
    # make sure not to change the original one
    assert len(paths) == 2
    assert len(paths[-1]['observations']) == 50
def test_truncates(self):
    truncated = utils.truncate_paths(self.paths, 130)
    assert len(truncated) == 2
    assert len(truncated[-1]['observations']) == 30
    assert len(truncated[0]['observations']) == 100
    # make sure not to change the original one
    assert len(self.paths) == 2
    assert len(self.paths[-1]['observations']) == 50
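# Illustrative sketch only: this is not the library's truncate_paths, but a
# minimal reimplementation of the behavior the tests above exercise. It
# assumes each path is a dict of equal-length arrays, with infos possibly
# being (nested) dicts of arrays.
def _truncate_value_sketch(value, n):
    # Recurse into nested dicts (e.g. agent_infos), slice array leaves.
    if isinstance(value, dict):
        return {k: _truncate_value_sketch(v, n) for k, v in value.items()}
    return value[:n]


def truncate_paths_sketch(paths, max_samples):
    """Keep whole paths until the sample budget is hit, then cut the last
    path so the total equals max_samples. The input paths are not modified.
    """
    truncated = []
    remaining = max_samples
    for path in paths:
        path_len = len(path['rewards'])
        if path_len <= remaining:
            truncated.append(path)
            remaining -= path_len
        else:
            truncated.append({key: _truncate_value_sketch(value, remaining)
                              for key, value in path.items()})
            remaining = 0
        if remaining == 0:
            break
    return truncated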
def obtain_samples(self, itr):
    cur_policy_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Obtain samples.""" if not batch_size: batch_size = self.algo.max_path_length cur_params = self.algo.policy.get_param_values() paths = parallel_sampler.sample_paths( policy_params=cur_params, max_samples=batch_size, max_path_length=self.algo.max_path_length, scope=self.algo.scope, ) return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    if not batch_size:
        batch_size = self.algo.max_path_length * self.n_envs
    cur_policy_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        max_samples=batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, batch_size)
        return paths_truncated
def obtain_is_samples(self, itr):
    paths = []
    for hist_policy_distribution, hist_paths in self.get_history_list(
            self.n_backtrack):
        h_paths = self.sample_isweighted_paths(
            policy=self.algo.policy,
            hist_policy_distribution=hist_policy_distribution,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            paths=hist_paths,
            hist_variance_penalty=self.hist_variance_penalty,
            max_is_ratio=self.max_is_ratio,
            ess_threshold=self.ess_threshold,
        )
        paths.extend(h_paths)
    if len(paths) > self.algo.batch_size:
        paths = random.sample(paths, self.algo.batch_size)
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def _obtain_is_samples(self, itr, batch_size=None, whole_paths=True):
    if batch_size is None:
        batch_size = self.algo.max_path_length

    paths = []
    for hist_policy_distribution, hist_paths in self.get_history_list(
            self.n_backtrack):
        h_paths = self._sample_isweighted_paths(
            policy=self.algo.policy,
            hist_policy_distribution=hist_policy_distribution,
            max_samples=batch_size,
            paths=hist_paths,
            hist_variance_penalty=self.hist_variance_penalty,
            max_is_ratio=self.max_is_ratio,
            ess_threshold=self.ess_threshold,
        )
        paths.extend(h_paths)

    if len(paths) > batch_size:
        paths = random.sample(paths, batch_size)

    return paths if whole_paths else truncate_paths(paths, batch_size)
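# The ess_threshold argument above typically guards against reusing
# historical paths whose importance weights are too degenerate. The snippet
# below is an illustration of that check, not the _sample_isweighted_paths
# implementation: it computes Kish's effective sample size from a set of
# importance-sampling weights.
import numpy as np


def effective_sample_size(is_weights):
    """ESS = (sum w)^2 / sum(w^2).

    A batch of importance-weighted paths is commonly discarded when this
    value falls below a threshold, since a few dominant weights make the
    reweighted estimate unreliable.
    """
    weights = np.asarray(is_weights, dtype=np.float64)
    return np.square(weights.sum()) / np.square(weights).sum()


# Example: uniform weights give ESS equal to the number of samples.
assert effective_sample_size([1.0, 1.0, 1.0, 1.0]) == 4.0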
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. Args: itr (int): Number of iteration. batch_size (int): Number of environment steps in one batch. whole_paths (bool): Whether to use whole path or truncated. Returns: list[dict]: A list of paths. """ if not batch_size: batch_size = self.algo.max_path_length cur_params = self.algo.policy.get_param_values() paths = parallel_sampler.sample_paths( policy_params=cur_params, max_samples=batch_size, max_path_length=self.algo.max_path_length, scope=self.algo.scope, ) return paths if whole_paths else truncate_paths(paths, batch_size)
def test_invalid_path(self):
    self.paths[0]['invalid'] = None
    with pytest.raises(ValueError):
        utils.truncate_paths(self.paths, 3)
def test_truncates_dict(self):
    truncated = utils.truncate_paths(self.paths_dict, 130)
    assert len(truncated) == 2
    assert len(truncated[-1]['agent_infos']['lala']['baba']) == 30
    assert len(truncated[0]['agent_infos']['lala']['baba']) == 100
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Obtain samples.""" logger.log('Obtaining samples for iteration %d...' % itr) if not batch_size: batch_size = self.algo.max_path_length * self.n_envs paths = [] n_samples = 0 obses = self.vec_env.reset() dones = np.asarray([True] * self.vec_env.num_envs) running_paths = [None] * self.vec_env.num_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy import time while n_samples < batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self.vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self.vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self.vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) if done: paths.append( dict(observations=self.env_spec.observation_space. flatten_n(running_paths[idx]['observations']), actions=self.env_spec.action_space.flatten_n( running_paths[idx]['actions']), rewards=tensor_utils.stack_tensor_list( running_paths[idx]['rewards']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['agent_infos']))) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) if whole_paths: return paths else: paths_truncated = truncate_paths(paths, batch_size) return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. Args: itr (int): Iteration number. batch_size (int): Number of samples to be collected. If None, it will be default [algo.max_path_length * n_envs]. whole_paths (bool): Whether return all the paths or not. True by default. It's possible for the paths to have total actual sample size larger than batch_size, and will be truncated if this flag is true. Returns: list[dict]: Sample paths, each path with key * observations: (numpy.ndarray) * actions: (numpy.ndarray) * rewards: (numpy.ndarray) * agent_infos: (dict) * env_infos: (dict) """ logger.log('Obtaining samples for iteration %d...' % itr) if not batch_size: batch_size = self.algo.max_path_length * self._n_envs paths = [] n_samples = 0 obses = self._vec_env.reset() dones = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy while n_samples < batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self._vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] if agent_infos is None: agent_infos = [dict() for _ in range(self._vec_env.num_envs)] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], ) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) if done: obs = np.asarray(running_paths[idx]['observations']) actions = np.asarray(running_paths[idx]['actions']) paths.append( dict(observations=obs, actions=actions, rewards=np.asarray(running_paths[idx]['rewards']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['agent_infos']))) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. If batch size is not specified, episode per task by default is 1 so batch size will be meta_batch_size * max_path_length. When number of workers are less than meta batch size, sampling will be performed for each of self._vec_envs_indices in series. The i-th value of self._vec_envs_indices represents the indices of the environments/tasks to be sampled for the i-th iteration. Args: itr (int): Iteration number. batch_size (int): Number of samples to be collected. If None, it will be default [algo.max_path_length * n_envs]. whole_paths (bool): Whether return all the paths or not. True by default. It's possible for the paths to have total actual sample size larger than batch_size, and will be truncated if this flag is true. Returns: OrderedDict: Sample paths. Key represents the index of the environment/task and value represents all the paths sampled from that particular environment/task. Note: Each path is a dictionary, with keys and values as following: * observations: numpy.ndarray with shape :math:`[N, S^*]` * actions: numpy.ndarray with shape :math:`[N, S^*]` * rewards: numpy.ndarray with shape :math:`[N, S^*]` * dones: numpy.ndarray with shape :math:`[N, S^*]` * env_infos: A dictionary with each key representing one environment info, value being a numpy.ndarray with shape :math:`[N, S^*]`. One example is "ale.lives" for atari environments. * agent_infos: A dictionary with each key representing one agent info, value being a numpy.ndarray with shape :math:`[N, S^*]`. One example is "prev_action", which is used for recurrent policy as previous action input, merged with the observation input as the state input. """ logger.log('Obtaining samples for iteration %d...' 
% itr) if batch_size is None: batch_size = self.algo.max_path_length * self._meta_batch_size paths = [] tasks = self.env.sample_tasks(self._meta_batch_size) # Start main loop batch_size_per_loop = batch_size // len(self._vec_envs_indices) for vec_envs_indices in self._vec_envs_indices: self._setup_worker(vec_envs_indices, tasks) n_samples = 0 obses = self._vec_env.reset() dones = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs pbar = ProgBarCounter(batch_size) policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy # Only reset policies at the beginning of a meta batch policy.reset(dones) while n_samples < batch_size_per_loop: t = time.time() actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = self._vec_env.step( actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] if agent_infos is None: agent_infos = [ dict() for _ in range(self._vec_env.num_envs) ] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict( observations=[], actions=[], rewards=[], dones=[], env_infos=[], agent_infos=[], ) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['dones'].append(done) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) if done: obs = np.asarray(running_paths[idx]['observations']) actions = np.asarray(running_paths[idx]['actions']) paths.append( dict(observations=obs, actions=actions, rewards=np.asarray( running_paths[idx]['rewards']), dones=np.asarray(running_paths[idx]['dones']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils. stack_tensor_dict_list( running_paths[idx]['agent_infos']), batch_idx=idx)) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.inc(len(obses)) obses = next_obses pbar.stop() tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Sample the policy for new trajectories. Args: itr (int): Iteration number. batch_size (int): Number of samples to be collected. If None, it will be default [algo.max_path_length * n_envs]. whole_paths (bool): Whether return all the paths or not. True by default. It's possible for the paths to have total actual sample size larger than batch_size, and will be truncated if this flag is true. Returns: list[dict]: Sample paths. Note: Each path is a dictionary, with keys and values as following: * observations: numpy.ndarray with shape [Batch, *obs_dims] * actions: numpy.ndarray with shape [Batch, *act_dims] * rewards: numpy.ndarray with shape [Batch, ] * env_infos: A dictionary with each key representing one environment info, value being a numpy.ndarray with shape [Batch, ?]. One example is "ale.lives" for atari environments. * agent_infos: A dictionary with each key representing one agent info, value being a numpy.ndarray with shape [Batch, ?]. One example is "prev_action", which is used for recurrent policy as previous action input, merged with the observation input as the state input. * dones: numpy.ndarray with shape [Batch, ] """ logger.log('Obtaining samples for iteration %d...' % itr) if not batch_size: batch_size = self.algo.max_path_length * self._n_envs paths = [] n_samples = 0 obses = self._vec_env.reset() dones = np.asarray([True] * self._vec_env.num_envs) running_paths = [None] * self._vec_env.num_envs policy_time = 0 env_time = 0 process_time = 0 policy = self.algo.policy with click.progressbar(length=batch_size, label='Sampling') as pbar: while n_samples < batch_size: t = time.time() policy.reset(dones) actions, agent_infos = policy.get_actions(obses) policy_time += time.time() - t t = time.time() next_obses, rewards, dones, env_infos = \ self._vec_env.step(actions) env_time += time.time() - t t = time.time() agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) env_infos = tensor_utils.split_tensor_dict_list(env_infos) if env_infos is None: env_infos = [dict() for _ in range(self._vec_env.num_envs)] if agent_infos is None: agent_infos = [ dict() for _ in range(self._vec_env.num_envs) ] for idx, observation, action, reward, env_info, agent_info, done in zip( # noqa: E501 itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones): if running_paths[idx] is None: running_paths[idx] = dict(observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[], dones=[]) running_paths[idx]['observations'].append(observation) running_paths[idx]['actions'].append(action) running_paths[idx]['rewards'].append(reward) running_paths[idx]['env_infos'].append(env_info) running_paths[idx]['agent_infos'].append(agent_info) running_paths[idx]['dones'].append(done) if done: obs = np.asarray(running_paths[idx]['observations']) actions = np.asarray(running_paths[idx]['actions']) paths.append( dict(observations=obs, actions=actions, rewards=np.asarray( running_paths[idx]['rewards']), env_infos=tensor_utils.stack_tensor_dict_list( running_paths[idx]['env_infos']), agent_infos=tensor_utils. 
stack_tensor_dict_list( running_paths[idx]['agent_infos']), dones=np.asarray( running_paths[idx]['dones']))) n_samples += len(running_paths[idx]['rewards']) running_paths[idx] = None process_time += time.time() - t pbar.update(len(obses)) obses = next_obses tabular.record('PolicyExecTime', policy_time) tabular.record('EnvExecTime', env_time) tabular.record('ProcessExecTime', process_time) return paths if whole_paths else truncate_paths(paths, batch_size)
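# Once paths in this format are returned, downstream processing typically
# reduces each path's rewards to returns. The helper below is a small,
# self-contained example of that step (not taken from the sampler code);
# the discount value is arbitrary.
import numpy as np


def discounted_returns(paths, discount=0.99):
    """Compute per-timestep discounted returns for each path."""
    all_returns = []
    for path in paths:
        rewards = np.asarray(path['rewards'], dtype=np.float64)
        returns = np.zeros_like(rewards)
        running = 0.0
        # Accumulate backwards: G_t = r_t + discount * G_{t+1}.
        for t in reversed(range(len(rewards))):
            running = rewards[t] + discount * running
            returns[t] = running
        all_returns.append(returns)
    return all_returns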
def obtain_samples(self, itr, batch_size=None, whole_paths=True): """Collect samples for the given iteration number. Parameters ---------- itr : int Iteration number. batch_size : int, optional How many simulation steps to run in each epoch. whole_paths : bool, optional Whether to return the full rollout paths data. """ if not batch_size: batch_size = self.algo.max_path_length * self.n_envs # cur_params = self.algo.policy.get_param_values() cur_policy_params = self.algo.policy.get_param_values() cur_env_params = self.algo.env.get_param_values() paths = parallel_sampler.sample_paths( policy_params=cur_policy_params, max_samples=batch_size, max_path_length=self.algo.max_path_length, env_params=cur_env_params, scope=self.algo.scope, ) # TODO: Doing the path correction here means the simulations will not be parallel. # Need to make own parallel sampler and put it there to make that work if self.open_loop: if self.batch_simulate: # import pdb; pdb.set_trace() paths = self.sim.batch_simulate_paths(paths=paths, reward_function=self.reward_function) else: for path in paths: s_0 = path["observations"][0] # actions = path['env_infos']['info']['actions'] actions = path['actions'] # pdb.set_trace() end_idx, info = self.sim.simulate(actions=actions, s_0=s_0) # print('----- Back from simulate: ', end_idx) if end_idx >= 0: # pdb.set_trace() self.slice_dict(path, end_idx) rewards = self.reward_function.give_reward( action=actions[end_idx], info=self.sim.get_reward_info() ) # print('----- Back from rewards: ', rewards) # pdb.set_trace() path["rewards"][end_idx] = rewards # info[:, -1] = path["rewards"][:info.shape[0]] # path['env_infos']['cache'] = info path['env_infos']['cache'] = np.zeros_like(path["rewards"]) # import pdb; pdb.set_trace() # return paths if whole_paths else truncate_paths(paths, batch_size) if whole_paths: return paths else: paths_truncated = truncate_paths(paths, batch_size) return paths_truncated