def obtain_samples(self, itr, log=True, log_prefix='', show_pbar=True):
    """
    Collect batch_size trajectories from each task

    Args:
        itr (int): current training iteration (not used in the body)
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        show_pbar (boolean): whether to show progress bar

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    n_samples = 0
    running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]

    if show_pbar:
        pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy

    # initial reset of envs
    obses = self.vec_env.reset()

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
        actions, agent_infos = policy.get_actions(obs_per_task)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        actions = np.concatenate(actions)  # stack meta batch
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos, agent_infos, dones):
            # append new samples to running paths
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(int(done))
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths[idx // self.envs_per_task].append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"], dtype=float),  # np.float is deprecated
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        if show_pbar:
            pbar.update(new_samples)
        n_samples += new_samples
        obses = next_obses

    if show_pbar:
        pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        tabular.record(log_prefix + "PolicyExecTime", policy_time)
        tabular.record(log_prefix + "EnvExecTime", env_time)

    return paths
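# ---------------------------------------------------------------------------
# All of the sampler variants in this section rely on a
# `_get_empty_running_paths_dict` helper that is not shown here. A minimal
# sketch consistent with how it is used (each key holds an appendable list)
# might look like this; the exact key set is an assumption, and variants that
# track extra fields (e.g. cp_obs / cp_act in the history-buffer sampler)
# would extend it.
# ---------------------------------------------------------------------------
def _get_empty_running_paths_dict():
    """Fresh accumulator for one environment's in-progress trajectory."""
    return dict(observations=[], actions=[], rewards=[], dones=[],
                env_infos=[], agent_infos=[])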
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    n_samples = 0
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)
    ]

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.meta_batch_size)

    # initial reset of meta_envs
    obses = self.vec_env.reset()

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
        if random:
            actions = np.stack([[self.env.action_space.sample()]
                                for _ in range(len(obses))], axis=0)
            # placeholder infos; the repeated inner list shares one dict
            # object, which is safe here because these infos are never mutated
            agent_infos = [[{
                'mean': np.zeros_like(self.env.action_space.sample()),
                'log_std': np.zeros_like(self.env.action_space.sample())
            }] * self.envs_per_task] * self.meta_batch_size
        else:
            actions, agent_infos = policy.get_actions(obs_per_task)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        actions = np.concatenate(actions)  # stack meta batch
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths[idx // self.envs_per_task].append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
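# ---------------------------------------------------------------------------
# `self._handle_info_dicts` is referenced throughout but not defined in this
# section. A hypothetical sketch matching how the callers use it: the loop
# over transitions expects exactly one info dict per vectorized env, so
# missing infos are replaced by empty dicts and meta-batched agent infos are
# flattened. Treat the body as an assumption, not the original helper.
# ---------------------------------------------------------------------------
def _handle_info_dicts(self, agent_infos, env_infos):
    if not env_infos:
        env_infos = [dict() for _ in range(self.vec_env.num_envs)]
    if not agent_infos:
        agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
    elif isinstance(agent_infos[0], list):
        # flatten [meta_batch_size] x [envs_per_task] -> [num_envs]
        agent_infos = [info for task_infos in agent_infos for info in task_infos]
    return agent_infos, env_infos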
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): A list of dicts with the samples
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    num_envs = self.vec_env.num_envs
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(num_envs)
    ]

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.vec_env.num_envs)

    # initial reset of meta_envs
    obses = np.asarray(self.vec_env.reset())

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if random:
            actions = np.stack(
                [self.env.action_space.sample() for _ in range(num_envs)],
                axis=0)
            agent_infos = {}
        else:
            a_bs = self.adapt_batch_size
            if a_bs is not None and len(
                    running_paths[0]['observations']) > a_bs + 1:
                adapt_obs = [
                    np.stack(running_paths[idx]['observations'][-a_bs - 1:-1])
                    for idx in range(num_envs)
                ]
                adapt_act = [
                    np.stack(running_paths[idx]['actions'][-a_bs - 1:-1])
                    for idx in range(num_envs)
                ]
                adapt_next_obs = [
                    np.stack(running_paths[idx]['observations'][-a_bs:])
                    for idx in range(num_envs)
                ]
                policy.dynamics_model.switch_to_pre_adapt()
                policy.dynamics_model.adapt(adapt_obs, adapt_act, adapt_next_obs)
            actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(self.vec_env.num_envs)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
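# ---------------------------------------------------------------------------
# The adaptation-window slices above are easy to misread. This standalone
# check with synthetic integer "observations" confirms that the two windows
# line up as (o_t, o_{t+1}) pairs for the last `a_bs` transitions.
# ---------------------------------------------------------------------------
obs = [0, 1, 2, 3, 4, 5]        # observations appended over time
a_bs = 3
adapt_obs = obs[-a_bs - 1:-1]   # [2, 3, 4]: inputs o_t
adapt_next = obs[-a_bs:]        # [3, 4, 5]: targets o_{t+1}
assert all(o_t1 == o_t + 1 for o_t, o_t1 in zip(adapt_obs, adapt_next))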
def obtain_samples(self, log=False, log_prefix='', random=False,
                   deterministic=False, eval=False, multiple_trajectory=1,
                   dynamics_model=None):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random
        deterministic (boolean): whether to act on the policy mean
        eval (boolean): whether to plan actions with the MPC controller
            through `dynamics_model` instead of acting directly
        multiple_trajectory (int): number of trajectory batches to collect
        dynamics_model: learned model used for the eval-time rollout

    Returns:
        (list): a list of length `multiple_trajectory`; each entry is a
        list of path dicts
    """
    # initial setup / preparation
    multiple_trajectories = []
    for _ in range(multiple_trajectory):
        paths = []
        n_samples = 0
        running_paths = _get_empty_running_paths_dict()

        if log:
            pbar = ProgBar(self.total_samples)
        policy_time, env_time = 0, 0

        policy = self.policy
        policy.reset(dones=[True])

        # initial reset of meta_envs
        obs = np.asarray(self.env.reset())
        ts = 0

        while n_samples < self.total_samples:
            # execute policy
            t = time.time()
            if eval:
                H = self.mpc.horizon
                mean_list = []
                std_list = []
                observation = obs
                # NOTE: the loop variable must not be named `t`, or it would
                # shadow the timing variable read after this block
                for _h in range(H + 1):
                    action, agent_info = policy.get_action(observation)
                    action = agent_info['mean']
                    mean_list.append(action)
                    std_list.append(agent_info['log_std'])
                    if self.policy.squashed:
                        action = np.tanh(action)
                    if observation.ndim == 1:
                        observation = observation[None]
                    if action.ndim == 1:
                        action = action[None]
                    observation = dynamics_model.predict(observation, action)
                    observation = observation.reshape((-1))
                action, _ = self.mpc.get_actions(obs[None], mean_list, std_list)
                if action.ndim == 2:
                    action = action[0]
            else:
                obs = obs.reshape((-1))
                if random:
                    action = self.env.action_space.sample()
                    agent_info = {}
                elif deterministic:
                    action, agent_info = policy.get_action(obs)
                    action = agent_info['mean']
                    if self.policy.squashed:
                        action = np.tanh(action)
                else:
                    action, agent_info = policy.get_action(obs)
                    if action.ndim == 2:
                        action = action[0]
            policy_time += time.time() - t

            # step environments
            t = time.time()
            next_obs, reward, done, env_info = self.env.step(action)
            ts += 1
            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(obs)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths, empty the running
            # path, and reset the environment
            if done or ts >= self.max_path_length:
                paths.append(dict(
                    observations=np.asarray(running_paths["observations"]),
                    actions=np.asarray(running_paths["actions"]),
                    rewards=np.asarray(running_paths["rewards"]),
                    dones=np.asarray(running_paths["dones"]),
                    env_infos=[],
                    agent_infos=[],
                    # env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                    # agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
                ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()
                next_obs = self.env.reset()
                ts = 0

            if log:
                pbar.update(new_samples)
            n_samples += new_samples
            obs = next_obs
        multiple_trajectories.append(paths)

        if log:
            pbar.stop()

        self.total_timesteps_sampled += self.total_samples
        if log:
            logger.logkv(log_prefix + "PolicyExecTime", policy_time)
            logger.logkv(log_prefix + "EnvExecTime", env_time)

    return multiple_trajectories
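# ---------------------------------------------------------------------------
# The eval branch above rolls the policy forward through the learned dynamics
# model to build the mean/std lists the MPC controller consumes. A runnable
# toy rendition with a stub model: the `predict(obs, act)` batch signature
# mirrors how the sampler calls it; everything else here is invented for
# illustration.
# ---------------------------------------------------------------------------
import numpy as np

class StubDynamics:
    def predict(self, obs, act):
        return obs + act                    # toy linear dynamics

dyn, obs = StubDynamics(), np.zeros(2)
mean_list = []
for _ in range(3):                          # horizon H = 3 imagined steps
    mean = np.full(2, 0.1)                  # stand-in for agent_info['mean']
    mean_list.append(mean)
    obs = dyn.predict(obs[None], mean[None]).reshape(-1)
print(obs)                                  # [0.3 0.3]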
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): A list of path dicts, each of length at most max_path_length
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    running_paths = _get_empty_running_paths_dict()

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True])

    # initial reset of meta_envs
    obs = np.asarray(self.env.reset())
    ts = 0

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if random:
            action = self.env.action_space.sample()
            agent_info = {}
        else:
            action, agent_info = policy.get_action(obs)
            if action.ndim == 2:
                action = action[0]
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obs, reward, done, env_info = self.env.step(action)

        ts += 1
        done = done or ts >= self.max_path_length
        if done:
            next_obs = self.env.reset()
            ts = 0

        env_time += time.time() - t

        new_samples = 0

        # append new samples to running paths
        if isinstance(reward, np.ndarray):
            reward = reward[0]
        running_paths["observations"].append(obs)
        running_paths["actions"].append(action)
        running_paths["rewards"].append(reward)
        running_paths["dones"].append(done)
        running_paths["env_infos"].append(env_info)
        running_paths["agent_infos"].append(agent_info)

        # if running path is done, add it to paths and empty the running path
        if done:
            paths.append(dict(
                observations=np.asarray(running_paths["observations"]),
                actions=np.asarray(running_paths["actions"]),
                rewards=np.asarray(running_paths["rewards"]),
                dones=np.asarray(running_paths["dones"]),
                env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
            ))
            new_samples += len(running_paths["rewards"])
            running_paths = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obs = next_obs
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
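# ---------------------------------------------------------------------------
# Downstream consumers typically reduce each path dict the sampler emits to a
# scalar return. A runnable toy example with synthetic paths shaped like the
# sampler's output (only the keys used below are populated):
# ---------------------------------------------------------------------------
import numpy as np

paths = [dict(rewards=np.array([1.0, 1.0, 0.5]), dones=np.array([False, False, True])),
         dict(rewards=np.array([0.2, 0.3]), dones=np.array([False, True]))]
returns = [p["rewards"].sum() for p in paths]
print(f"{len(paths)} paths, mean return {np.mean(returns):.2f}")  # 2 paths, mean return 1.50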
def obtain_samples(self, log=False, log_prefix=''):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger

    Returns:
        (list): A list of path dicts; this sampler stores every transition
        as its own single-step path
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    running_paths = dict()

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy

    # initial reset of envs
    obses = self.env.reset()

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        obs_per_task = np.array(obses)
        actions, logits, values = policy.get_actions(obs_per_task)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.env.step(actions)
        env_time += time.time() - t

        new_samples = 0
        for observation, action, logit, reward, value, finish_time in zip(
                obses, actions, logits, rewards, values, env_infos):
            # store the transition and immediately emit it as a
            # (single-step) path
            running_paths["observations"] = observation
            running_paths["actions"] = action
            running_paths["logits"] = logit
            running_paths["rewards"] = reward
            running_paths["values"] = value
            running_paths["finish_time"] = finish_time

            paths.append(dict(
                observations=np.squeeze(np.asarray(running_paths["observations"])),
                actions=np.squeeze(np.asarray(running_paths["actions"])),
                logits=np.squeeze(np.asarray(running_paths["logits"])),
                rewards=np.squeeze(np.asarray(running_paths["rewards"])),
                values=np.squeeze(np.asarray(running_paths["values"])),
                finish_time=np.squeeze(np.asarray(running_paths["finish_time"]))))

            new_samples += len(running_paths["rewards"])
            running_paths = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    running_paths = _get_empty_running_paths_dict()

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0
    policy = self.policy

    # collect samples for one task at a time
    for idx in range(self.meta_batch_size):
        ts = 0
        n_samples = 0
        init_obs = np.expand_dims(self.env.reset(), 0).copy()
        obses = [init_obs for _ in range(self.meta_batch_size)]
        policy.reset(dones=[True] * self.meta_batch_size)
        while n_samples < self.samples_per_task:
            # execute policy
            t = time.time()
            if random:
                actions = np.stack([[self.env.action_space.sample()]
                                    for _ in range(len(obses))], axis=0)
                agent_infos = [[{
                    'mean': np.zeros_like(self.env.action_space.sample()),
                    'log_std': np.zeros_like(self.env.action_space.sample())
                }] * self.envs_per_task] * self.meta_batch_size
            else:
                actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t

            # step environments
            t = time.time()
            action, agent_info = actions[idx][0], agent_infos[idx][0]
            observation = obses[idx][0].copy()
            next_obs, reward, done, env_info = self.env.step(action)

            ts += 1
            done = done or ts >= self.max_path_length
            if done:
                next_obs = self.env.reset()
                ts = 0

            env_time += time.time() - t

            new_samples = 0

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths["observations"].append(observation)
            running_paths["actions"].append(action)
            running_paths["rewards"].append(reward)
            running_paths["dones"].append(done)
            running_paths["env_infos"].append(env_info)
            running_paths["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths[idx].append(dict(
                    observations=np.asarray(running_paths["observations"]),
                    actions=np.asarray(running_paths["actions"]),
                    rewards=np.asarray(running_paths["rewards"]),
                    dones=np.asarray(running_paths["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
                ))
                new_samples += len(running_paths["rewards"])
                running_paths = _get_empty_running_paths_dict()

            pbar.update(new_samples)
            n_samples += new_samples
            obses[idx][0] = next_obs
        self.total_timesteps_sampled += n_samples
    pbar.stop()

    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
def obtain_samples(self, log=False, log_prefix='', random=False,
                   deterministic=False, sinusoid=False, verbose=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random
        deterministic (boolean): whether to act on the policy mean
        sinusoid (boolean): whether to take sinusoidal actions
        verbose (boolean): whether to show a progress bar

    Returns:
        (list): A list of path dicts
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)
    ]

    if verbose:
        pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.vec_env.num_envs)

    # initial reset of meta_envs
    obses = np.asarray(self.vec_env.reset())

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if self.vae is not None:
            obses = np.array(obses)
            obses = self.vae.encode(obses)
        if random:
            actions = np.stack([
                self.env.action_space.sample()
                for _ in range(self.vec_env.num_envs)
            ], axis=0)
            agent_infos = {}
        elif deterministic:
            actions, agent_infos = policy.get_actions(obses)
            actions = [a_i['mean'] for a_i in agent_infos]
        elif sinusoid:
            action_space = self.env.action_space.shape[0]
            num_envs = self.vec_env.num_envs
            # NOTE: `t` here is the wall-clock timestamp captured above, not
            # an environment timestep, so the sinusoid phase advances with
            # real time; this looks unintended but is preserved as-is
            actions = np.stack([
                policy.get_sinusoid_actions(action_space,
                                            t / policy.horizon * 2 * np.pi)
                for _ in range(num_envs)
            ], axis=0)
            agent_infos = dict()
        else:
            obses = np.array(obses)
            actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        if verbose:
            pbar.update(self.vec_env.num_envs)
        n_samples += new_samples
        obses = next_obses
    if verbose:
        pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "TimeStepsCtr", self.total_timesteps_sampled)
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
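# ---------------------------------------------------------------------------
# Path finalization throughout this section goes through
# `utils.stack_tensor_dict_list`. A minimal sketch consistent with its usage:
# a list of per-step info dicts becomes a single dict of stacked arrays,
# recursing into nested dicts. This is a reimplementation under that
# assumption, not the library code itself.
# ---------------------------------------------------------------------------
import numpy as np

def stack_tensor_dict_list(tensor_dict_list):
    out = {}
    for k in tensor_dict_list[0].keys():
        vals = [d[k] for d in tensor_dict_list]
        if isinstance(vals[0], dict):
            out[k] = stack_tensor_dict_list(vals)   # nested info dicts
        else:
            out[k] = np.asarray(vals)
    return out

stacked = stack_tensor_dict_list([{'mean': np.zeros(2)}, {'mean': np.ones(2)}])
assert stacked['mean'].shape == (2, 2)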
def obtain_samples(self, log=False, log_prefix='', random=False,
                   advance_curriculum=False, policy=None, teacher_dict={},
                   max_action=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random
        advance_curriculum (boolean): whether to advance the env curriculum
            before sampling
        policy: policy to sample with (defaults to self.policy)
        teacher_dict (dict): teacher configuration passed to the observation
            preprocessor
        max_action (boolean): whether to replace sampled actions with the
            argmax of the action distribution

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    n_samples = 0
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)
    ]

    total_paths = self.rollouts_per_meta_task * self.meta_batch_size * self.envs_per_task
    pbar = ProgBar(total_paths)
    policy_time, env_time = 0, 0

    if policy is None:
        policy = self.policy
    policy.reset(dones=[True] * self.meta_batch_size)
    if self.reward_predictor is not None:
        self.reward_predictor.reset(dones=[True] * self.meta_batch_size)
    if self.supervised_model is not None:
        self.supervised_model.reset(dones=[True] * self.meta_batch_size)

    # initial reset of meta_envs
    if advance_curriculum:
        self.vec_env.advance_curriculum()
    self.update_tasks()
    obses = self.vec_env.reset()
    num_paths = 0

    while num_paths < total_paths:
        # execute policy
        t = time.time()
        obses = self.obs_preprocessor(obses, teacher_dict)
        if random:
            actions = np.stack([[self.env.action_space.sample()]
                                for _ in range(len(obses))], axis=0)
            agent_infos = [[{
                'mean': np.zeros_like(self.env.action_space.sample()),
                'log_std': np.zeros_like(self.env.action_space.sample())
            }] * self.envs_per_task] * self.meta_batch_size
        else:
            actions, agent_infos = policy.get_actions_t(obses)
            if max_action:
                # TODO: double check this still works
                assert False, "We haven't checked this still works with the new model; if it does, feel free to delete."
                original_action_shape = actions.shape
                actions = [[[np.argmax(d['probs'])] for d in agent_info]
                           for agent_info in agent_infos]
                actions = np.array(actions, dtype=np.int32)
                if not actions.shape == original_action_shape:
                    assert False, (actions.shape, original_action_shape)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        new_samples = 0
        new_paths = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                curr_path = paths[idx // self.envs_per_task]
                if len(curr_path) >= self.rollouts_per_meta_task:
                    # task already has enough rollouts; discard this path, but
                    # still clear the accumulator so it cannot grow unboundedly
                    running_paths[idx] = _get_empty_running_paths_dict()
                    continue
                paths[idx // self.envs_per_task].append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                num_paths += 1
                new_paths += 1
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(new_paths)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += n_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
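# ---------------------------------------------------------------------------
# The max_action branch above swaps sampled actions for the argmax of each
# agent's action distribution. A runnable illustration with a fake
# [meta_batch_size] x [envs_per_task] agent_infos structure:
# ---------------------------------------------------------------------------
import numpy as np

agent_infos = [[{'probs': np.array([0.1, 0.7, 0.2])},
                {'probs': np.array([0.5, 0.3, 0.2])}]]
actions = np.array([[[np.argmax(d['probs'])] for d in agent_info]
                    for agent_info in agent_infos], dtype=np.int32)
print(actions.tolist())   # [[[1], [0]]] -- greedy action indices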
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): A list of dicts with the samples
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    num_envs = self.vec_env.num_envs
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(num_envs)
    ]

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    if self.use_cem:
        for i in range(num_envs):
            self.reset_cem(i)

    # initial reset of meta_envs
    obses = np.asarray(self.vec_env.reset())
    state_counts = [0] * self.vec_env.num_envs

    # history buffers: a sliding window of the last history_length
    # observations / actions per env, stored flat
    self.obs_dim = obses.shape[1]
    history_state = np.zeros((obses.shape[0], self.obs_dim * self.history_length))
    history_act = np.zeros((obses.shape[0], self.act_dim * self.history_length))

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if random:
            actions = np.stack(
                [self.env.action_space.sample() for _ in range(num_envs)],
                axis=0)
            agent_infos = {}
        else:
            if self.use_cem:
                if self.context:
                    cem_solutions, agent_infos = policy.get_actions(
                        obses, init_mean=self.prev_sol, init_var=self.init_var,
                        cp_obs=history_state, cp_act=history_act)
                else:
                    cem_solutions, agent_infos = policy.get_actions(
                        obses, init_mean=self.prev_sol, init_var=self.init_var)
                # warm-start the next CEM solve with the shifted solution
                self.prev_sol[:, :-1] = cem_solutions[:, 1:].copy()
                self.prev_sol[:, -1:] = 0.
                actions = cem_solutions[:, 0].copy()
            else:
                if self.context:
                    actions, agent_infos = policy.get_actions(
                        obses, cp_obs=history_state, cp_act=history_act)
                else:
                    actions, agent_infos = policy.get_actions(obses)
            if len(self.env.action_space.shape) == 0:
                actions = actions.reshape(-1)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if len(self.env.action_space.shape) == 0:
                action = np.eye(self.act_dim)[action]  # one-hot discrete actions
            else:
                if action.ndim == 0:
                    action = np.expand_dims(action, 0)
                assert action.ndim == 1, (action, action.shape)

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            running_paths[idx]["cp_obs"].append(history_state[idx].copy())
            running_paths[idx]["cp_act"].append(history_act[idx].copy())

            # maintain the history buffer: fill slots until full, then shift
            # left by one step and append at the end
            if state_counts[idx] < self.history_length:
                if self.state_diff:
                    history_state[idx][state_counts[idx] * self.obs_dim:
                                       (state_counts[idx] + 1) * self.obs_dim] = \
                        next_obses[idx] - observation
                else:
                    history_state[idx][state_counts[idx] * self.obs_dim:
                                       (state_counts[idx] + 1) * self.obs_dim] = observation
                history_act[idx][state_counts[idx] * self.act_dim:
                                 (state_counts[idx] + 1) * self.act_dim] = action
            else:
                history_state[idx][:-self.obs_dim] = history_state[idx][self.obs_dim:]
                if self.state_diff:
                    history_state[idx][-self.obs_dim:] = next_obses[idx] - observation
                else:
                    history_state[idx][-self.obs_dim:] = observation
                history_act[idx][:-self.act_dim] = history_act[idx][self.act_dim:]
                history_act[idx][-self.act_dim:] = action

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    cp_obs=np.asarray(running_paths[idx]["cp_obs"]),
                    cp_act=np.asarray(running_paths[idx]["cp_act"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

                if not random and self.use_cem:
                    self.reset_cem(idx)

                state_counts[idx] = 0
                history_state[idx] = np.zeros((self.obs_dim * self.history_length))
                history_act[idx] = np.zeros((self.act_dim * self.history_length))
            else:
                state_counts[idx] += 1

        pbar.update(self.vec_env.num_envs)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
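# ---------------------------------------------------------------------------
# The history-buffer update above interleaves two regimes (fill, then
# shift-left). A compact, runnable rendition of the same logic for a single
# env with synthetic dimensions makes the intent clearer:
# ---------------------------------------------------------------------------
import numpy as np

obs_dim, history_length = 2, 3
history = np.zeros(obs_dim * history_length)

def push(history, obs, count):
    """Mirror of the sampler's update: fill slots until the buffer is full,
    then shift left by one observation and append at the end."""
    if count < history_length:
        history[count * obs_dim:(count + 1) * obs_dim] = obs
    else:
        history[:-obs_dim] = history[obs_dim:]
        history[-obs_dim:] = obs
    return history

for step in range(5):
    history = push(history, np.full(obs_dim, step), step)
print(history)  # [2. 2. 3. 3. 4. 4.] -> the three most recent observations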
def obtain_samples(self, log=False, log_prefix=''):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    n_samples = 0
    running_paths = [
        _get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)
    ]

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy

    # initial reset of envs
    obses = self.vec_env.reset()

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        obs_per_task = np.array(obses)
        actions, logits, values = policy.get_actions(obs_per_task)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        new_samples = 0
        for idx, observation, action, logit, reward, value, done, task_finish_times in zip(
                itertools.count(), obses, actions, logits, rewards, values,
                dones, env_infos):
            # store each transition as its own (single-step) path
            for single_ob, single_ac, single_logit, single_reward, single_value, single_task_finish_time \
                    in zip(observation, action, logit, reward, value, task_finish_times):
                running_paths[idx]["observations"] = single_ob
                running_paths[idx]["actions"] = single_ac
                running_paths[idx]["logits"] = single_logit
                running_paths[idx]["rewards"] = single_reward
                running_paths[idx]["finish_time"] = single_task_finish_time
                running_paths[idx]["values"] = single_value

                paths[idx // self.envs_per_task].append(dict(
                    observations=np.squeeze(np.asarray(running_paths[idx]["observations"])),
                    actions=np.squeeze(np.asarray(running_paths[idx]["actions"])),
                    logits=np.squeeze(np.asarray(running_paths[idx]["logits"])),
                    rewards=np.squeeze(np.asarray(running_paths[idx]["rewards"])),
                    finish_time=np.squeeze(np.asarray(running_paths[idx]["finish_time"])),
                    values=np.squeeze(np.asarray(running_paths[idx]["values"]))))

                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
def obtain_samples(self, log=False, log_prefix='', test=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        test (boolean): if True, sampled timesteps are not counted towards
            total_timesteps_sampled

    Returns:
        (dict): A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length)
    """
    print("total_samples:", self.total_samples)
    print("meta_batch_size:", self.meta_batch_size)
    print("max_path_length:", self.max_path_length)
    print("--------------obtaining",
          self.total_samples // self.meta_batch_size // self.max_path_length,
          "rollouts_per_task, for", self.meta_batch_size, "tasks..--------------")

    # initial setup / preparation
    paths = OrderedDict()
    for i in range(self.meta_batch_size):
        paths[i] = []

    n_samples = 0
    running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)]
    print("running_paths length:", len(running_paths))

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy

    # initial reset of envs
    obses = self.vec_env.reset()

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
        actions, agent_infos = policy.get_actions(obs_per_task)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        actions = np.concatenate(actions)  # stack meta batch
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            # append new samples to running paths
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)

            # if running path is done, add it to paths and empty the running path
            if done:
                paths[idx // self.envs_per_task].append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    if not test:
        self.total_timesteps_sampled += self.total_samples
        print("------------self.total_timesteps_sampled:",
              self.total_timesteps_sampled, "-----------------")
    else:
        print("------------tested on:",
              self.total_samples // self.max_path_length,
              "rollouts-----------------")

    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
def obtain_samples(self, log=False, log_prefix='', buffer=None):
    """
    Collect fixed-horizon batches of length max_path_length from all
    vectorized envs

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        buffer: optional buffer passed through to the vectorized env reset

    Returns:
        (dict): samples stacked over time, with keys
        observations / actions / rewards / returns / dones
    """
    # initial setup / preparation
    pbar = ProgBar(self.max_path_length)
    policy_time, env_time = 0, 0

    policy = self.policy
    policy.reset(dones=[True] * self.vec_env.num_envs)

    # initial reset of meta_envs
    obses = self.vec_env.reset(buffer)

    time_step = 0
    list_observations = []
    list_actions = []
    list_rewards = []
    list_dones = []
    mask = np.ones((self.vec_env.num_envs,))

    while time_step < self.max_path_length:
        # execute policy
        t = time.time()
        if self.vae is not None:
            obses = np.array(obses)
            obses = self.vae.encode(obses)
            obses = np.split(obses, self.vec_env.num_envs, axis=0)
        if self.dynamics_model is not None:
            actions, agent_infos = policy.get_actions_batch(obses, update_filter=False)
        else:
            obses = np.array(obses)
            actions, agent_infos = policy.get_actions_batch(obses, update_filter=True)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, _ = self.vec_env.step(actions)
        next_obses, rewards, dones = np.array(next_obses), np.array(rewards), np.array(dones)

        # zero out rewards of already-finished envs and keep them "done"
        rewards *= mask
        dones = dones + (1 - mask)
        mask *= (1 - dones)
        env_time += time.time() - t

        list_observations.append(obses)
        list_actions.append(actions)
        list_rewards.append(rewards)
        list_dones.append(dones)
        time_step += 1

        obses = next_obses
        pbar.update(1)
    pbar.stop()

    self.total_timesteps_sampled += np.sum(1 - np.array(list_dones))
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    samples_data = dict(observations=np.array(list_observations),
                        actions=np.array(list_actions),
                        rewards=np.array(list_rewards),
                        returns=np.sum(list_rewards, axis=0),
                        dones=np.array(list_dones))
    return samples_data
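# ---------------------------------------------------------------------------
# Standalone check of the masking logic above: once an env reports done, its
# mask drops to 0, its later rewards are zeroed, and it keeps reporting done
# for the remainder of the fixed horizon.
# ---------------------------------------------------------------------------
import numpy as np

mask = np.ones(3)
for step_dones in [np.array([0., 1., 0.]),
                   np.array([0., 0., 0.]),
                   np.array([1., 0., 0.])]:
    rewards = np.ones(3) * mask      # rewards past termination are masked out
    dones = step_dones + (1 - mask)  # already-finished envs stay "done"
    mask *= (1 - dones)
    print(rewards, dones)
# [1. 1. 1.] [0. 1. 0.]
# [1. 0. 1.] [0. 1. 0.]
# [1. 0. 1.] [1. 1. 0.]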