def train(self):
    self.start_worker()
    self.init_opt()
    # Shared state between the training workers and this monitoring loop.
    manager = multiprocessing.Manager()
    lock = manager.RLock()
    g_counter = manager.Value('counter', 0)
    g_opt_info = manager.Value('opt_info', self.opt_info)
    for epoch in range(self.n_epochs):
        logger.push_prefix('epoch %d | ' % epoch)
        logger.log('Training started')
        singleton_pool.run_each(
            train_worker,
            [(g_counter, g_opt_info, self.t_max, self.discount, lock,
              self.scope)] * singleton_pool.n_parallel)
        threshold = self.epoch_length
        pbar = ProgBarCounter(threshold)
        last_value = 0
        # Poll the shared counter until the workers have collected enough
        # samples for this epoch, then evaluate and reset the counter.
        while True:
            time.sleep(0.1)
            with lock:
                if g_counter.value >= threshold:
                    logger.log('Training finished')
                    pbar.stop()
                    g_counter.value = 0
                    logger.log('Evaluating ...')
                    self.evaluate(epoch, g_opt_info.value)
                    logger.dump_tabular(with_prefix=False)
                    logger.pop_prefix()
                    break
                pbar.inc(g_counter.value - last_value)
                last_value = g_counter.value
    self.terminate_task()
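# A minimal, self-contained sketch of the shared-counter pattern used by
# train() above: worker processes increment a multiprocessing.Manager Value
# under a lock while the parent polls it until a threshold is reached. The
# worker function, step count, and threshold here are illustrative
# stand-ins, not part of the original codebase.
import multiprocessing
import time


def _demo_worker(counter, lock, n_steps):
    for _ in range(n_steps):
        time.sleep(0.01)  # stand-in for one training step
        with lock:
            counter.value += 1


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    lock = manager.RLock()
    counter = manager.Value('i', 0)
    threshold = 40
    workers = [multiprocessing.Process(target=_demo_worker,
                                       args=(counter, lock, 10))
               for _ in range(4)]
    for w in workers:
        w.start()
    while True:
        time.sleep(0.1)
        with lock:
            if counter.value >= threshold:
                break
    for w in workers:
        w.join()
    print('collected', counter.value, 'steps')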
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return the paths as a dict (keyed by env index)
    # or as a flat list
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # If reset_args is not a list/array, broadcast the same args to every env.
    if reset_args is not None and not isinstance(reset_args, (list, np.ndarray)):
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(
            actions, reset_args)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                # Trajectory finished: flatten/stack it and start a new one.
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
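# The split/stack helpers from tensor_utils are used in every sampler here.
# Below is a minimal reimplementation of the split direction, written to
# match the behavior the code above relies on: a dict of batched (possibly
# nested) arrays becomes a list of per-index dicts, and an empty dict yields
# None, which is why the callers guard with `if agent_infos is None`. This is
# a sketch of the assumed semantics, not the library source.
def split_tensor_dict_list_sketch(tensor_dict):
    ret = None
    for k, vals in tensor_dict.items():
        if isinstance(vals, dict):
            vals = split_tensor_dict_list_sketch(vals)
        if ret is None:
            ret = [{k: v} for v in vals]
        else:
            for v, cur_dict in zip(vals, ret):
                cur_dict[k] = v
    return ret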
def obtain_samples(self, itr, return_dict=False, log=True, log_prefix='',
                   traj_starting_obs=None, traj_starting_ts=None):
    """
    :param itr: current iteration (int), for logging purposes
    :param return_dict: (boolean) whether to return a dict or a list of paths
    :param log: (boolean) indicates whether to log
    :param log_prefix: (str) prefix to prepend to the log keys
    :param traj_starting_obs: (optional) starting observations to randomly
        choose from when rolling out trajectories, as a numpy array of shape
        (n_observations, ndim_obs); if None, env.reset() is used to obtain
        initial observations
    :return: the sampled paths (a dict keyed by task index, or a flat list)
    """
    assert traj_starting_obs is None or traj_starting_obs.ndim == 2
    paths = {}
    for i in range(self.meta_batch_size):
        paths[i] = []
    n_samples = 0
    n_parallel_per_task = self.vec_env.num_envs // self.meta_batch_size
    obses = self.vec_env.reset(traj_starting_obs=traj_starting_obs)
    dones = np.asarray([True] * self.n_parallel)
    running_paths = [None] * self.n_parallel
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size_dynamics_samples:
        t = time.time()
        policy.reset(dones)
        # Get actions from the MAML policy: one batch of observations per task.
        obs_per_task = np.split(np.asarray(obses), self.meta_batch_size)
        actions, agent_infos = policy.get_actions_batch(obs_per_task)
        assert actions.shape[0] == self.n_parallel
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(
            actions, traj_starting_obs=traj_starting_obs,
            traj_starting_ts=traj_starting_ts)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                # Environments are grouped contiguously by task, so
                # idx // n_parallel_per_task is the task index.
                paths[idx // n_parallel_per_task].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    if log:
        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    else:
        assert len(paths) == self.meta_batch_size
    return paths
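# A quick check of the per-task observation split used by the MAML sampler
# above: environments are grouped contiguously by task, so np.split yields
# meta_batch_size chunks of n_parallel_per_task observations each. The
# shapes below are illustrative.
import numpy as np

meta_batch_size, n_parallel_per_task, obs_dim = 4, 5, 3
_obses = np.zeros((meta_batch_size * n_parallel_per_task, obs_dim))
_obs_per_task = np.split(_obses, meta_batch_size)
assert len(_obs_per_task) == meta_batch_size
assert _obs_per_task[0].shape == (n_parallel_per_task, obs_dim)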
def obtain_samples(self, itr):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    terminals=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["terminals"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    terminals=tensor_utils.stack_tensor_list(
                        running_paths[idx]["terminals"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
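# A minimal, runnable illustration of the path format produced above:
# per-step lists are accumulated while an episode runs, then stacked into
# arrays when `done` is hit. tensor_utils.stack_tensor_list is modeled here
# as np.asarray, which matches how it is used on lists of scalars.
import numpy as np

running_path = dict(observations=[], actions=[], rewards=[], terminals=[])
for step in range(5):
    running_path["observations"].append(np.zeros(3))
    running_path["actions"].append(np.zeros(2))
    running_path["rewards"].append(1.0)
    running_path["terminals"].append(step == 4)
path = {k: np.asarray(v) for k, v in running_path.items()}
assert path["rewards"].shape == (5,) and path["terminals"][-1]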
def obtain_samples(self, itr, reset_args=None, return_dict=False,
                   log_prefix='', preupdate=False, save_img_obs=False,
                   contexts=None):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return the paths as a dict (keyed by env index)
    # or as a flat list
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # If reset_args is not a list/array, broadcast the same args to every env.
    if reset_args is not None and not isinstance(reset_args, (list, np.ndarray)):
        print("WARNING, will vectorize reset_args")
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    # Tracks which rollout each environment instance is currently on.
    path_nums = [0] * self.vec_env.num_envs
    obses = self.vec_env.reset(reset_args)
    if contexts is not None:
        # Condition the policy by concatenating the per-env context onto
        # the observation.
        obses = np.concatenate([obses, contexts], axis=1)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    if contexts is not None:
        policy = self.algo.post_policy
    else:
        policy = self.algo.policy
    import time
    while n_samples < self.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(
            actions, reset_args)
        # TODO: instead of receiving the obs from the env, receive it from
        # the policy as a feed_dict.
        if contexts is not None:
            next_obses = np.concatenate([next_obses, contexts], axis=1)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                # TODO: also add the incomplete running_paths to paths.
                running_paths[idx] = None
                path_nums[idx] += 1
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
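# A quick check of the context-conditioning used above: per-env context
# vectors are concatenated onto observations along the feature axis, so the
# policy sees [obs, context]. The dimensions below are illustrative.
import numpy as np

num_envs, obs_dim, context_dim = 4, 6, 2
_obses = np.zeros((num_envs, obs_dim))
_contexts = np.ones((num_envs, context_dim))
_augmented = np.concatenate([_obses, _contexts], axis=1)
assert _augmented.shape == (num_envs, obs_dim + context_dim)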
def obtain_samples(self, itr):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        self.algo.policy.reset(dones)
        actions, agent_infos = self.algo.policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
def train(self):
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        self.start_worker()
        logger.log("Populating replay memory with random experience...")
        self.sampler.obtain_random_samples(self.pre_trained_size)
        start_time = time.time()
        total_time_step = 0
        for itr in range(self.start_itr, self.n_itr + 1):
            itr_start_time = time.time()
            self.total_episodic_rewards = [
                [] for _ in range(len(self.env.agents))
            ]
            with logger.prefix('itr #%d | ' % itr):
                p_bar = ProgBarCounter(self.max_path_length)
                logger.log("Running trajectories, obtaining samples and "
                           "optimizing Q network...")
                for time_step in range(self.max_path_length):
                    total_time_step += 1
                    paths = self.obtain_samples(itr)
                    samples_data = self.process_samples(itr, paths)
                    self.optimize_policy(itr, samples_data)
                    # Periodically sync the target network with the online
                    # Q network.
                    if total_time_step % self.target_network_update == 0:
                        logger.log("Copying weights to target Q network...")
                        self.target_policy.set_param_values(
                            self.policy.get_param_values())
                    p_bar.inc(time_step + 1)
                    if self.sampler.done:
                        break
                p_bar.stop()
                self.sampler.done = True
                self.log_statistics(itr, time_step + 1)
                logger.log("Logging statistics...")
                logger.log("Logging diagnostics...")
                self.log_diagnostics(paths)
                if itr % self.save_param_update == 0:
                    logger.log("Saving snapshot...")
                    params = self.get_itr_snapshot(itr, samples_data)  # , **kwargs)
                    if self.store_paths:
                        if isinstance(samples_data, list):
                            params["paths"] = [
                                sd["paths"] for sd in samples_data
                            ]
                        else:
                            params["paths"] = samples_data["paths"]
                    logger.save_itr_params(itr, params)
                    logger.log("Saved")
                logger.record_tabular('Time', time.time() - start_time)
                logger.record_tabular('ItrTime', time.time() - itr_start_time)
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")
        self.shutdown_worker()
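# A minimal sketch of the periodic target-network sync used in train()
# above, with get_param_values/set_param_values modeled as flat numpy
# vectors. ToyNet and the step counts are illustrative assumptions, not the
# original classes.
import numpy as np


class ToyNet:
    def __init__(self, n_params):
        self.params = np.random.randn(n_params)

    def get_param_values(self):
        return self.params.copy()

    def set_param_values(self, values):
        self.params = values.copy()


policy, target = ToyNet(8), ToyNet(8)
target_network_update = 500
for step in range(1, 2001):
    # ... a gradient step on `policy` would happen here ...
    if step % target_network_update == 0:
        # Hard update: copy the online weights into the target network.
        target.set_param_values(policy.get_param_values())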
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    # reset_args: arguments to pass to the environments on reset
    # return_dict: whether to return the paths as a dict (keyed by env index)
    # or as a flat list
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = {}
    for i in range(self.vec_env.num_envs):
        paths[i] = []
    # If reset_args is not a list/array, broadcast the same args to every env.
    if reset_args is not None and not isinstance(reset_args, (list, np.ndarray)):
        reset_args = [reset_args] * self.vec_env.num_envs
    n_samples = 0
    obses = self.vec_env.reset(reset_args)
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(
            actions, reset_args)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths[idx].append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
    logger.record_tabular(log_prefix + "EnvExecTime", env_time)
    logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())
    return paths
def obtain_samples(self, itr, max_path_length, batch_size, max_n_trajs=None):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    dones = np.asarray([True] * self.vec_env.n_envs)
    obses = self.vec_env.reset(dones)
    running_paths = [None] * self.vec_env.n_envs
    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.policy
    import time
    while n_samples < batch_size:
        t = time.time()
        # Some vectorized envs coordinate the policy reset themselves.
        if hasattr(self.vec_env, "handle_policy_reset"):
            self.vec_env.handle_policy_reset(policy, dones)
        else:
            policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(
            actions, max_path_length=max_path_length)
        if np.any(dones):
            # Reset only the finished envs; reset(dones) returns their fresh
            # observations in flag order, so splice them back in by index.
            new_obses = self.vec_env.reset(dones)
            reset_idx = 0
            for idx, done in enumerate(dones):
                if done:
                    next_obses[idx] = new_obses[reset_idx]
                    reset_idx += 1
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.n_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.n_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
                if max_n_trajs is not None and len(paths) >= max_n_trajs:
                    # NOTE: this only exits the per-env loop; sampling still
                    # continues until batch_size samples are collected.
                    break
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
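# A minimal sketch of the reset(dones) contract the code above relies on:
# only the environments flagged done are reset, and their fresh observations
# come back in flag order, which is why next_obses[idx] is overwritten from
# new_obses[reset_idx]. ToyVecEnv and _StubEnv are illustrative stand-ins.
import numpy as np


class _StubEnv:
    def reset(self):
        return np.zeros(3)


class ToyVecEnv:
    def __init__(self, envs):
        self.envs = envs
        self.n_envs = len(envs)

    def reset(self, dones):
        # One fresh observation per env whose done flag is set, in order.
        return np.stack([env.reset()
                         for env, done in zip(self.envs, dones) if done])


vec_env = ToyVecEnv([_StubEnv() for _ in range(4)])
assert vec_env.reset([True, False, True, False]).shape == (2, 3)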
def obtain_samples(self, itr, num_samples=None, log=True,
                   log_prefix='RandomSampler-'):
    if num_samples is None:
        num_samples = self.algo.batch_size
    paths = []
    n_samples_collected = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(num_samples)
    policy_time = 0
    env_time = 0
    process_time = 0
    import time
    while n_samples_collected < num_samples:
        # No policy is queried: draw uniformly random actions, one per env.
        t = time.time()
        actions = np.stack([
            self.vec_env.action_space.sample() for _ in range(len(obses))
        ], axis=0)
        policy_time += time.time() - t
        agent_infos = {}
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples_collected += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    if log:
        logger.record_tabular(log_prefix + "PolicyExecTime", policy_time)
        logger.record_tabular(log_prefix + "EnvExecTime", env_time)
        logger.record_tabular(log_prefix + "ProcessExecTime", process_time)
    return paths
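# The random sampler above builds its action batch by sampling each env's
# action space independently and stacking the results. A dependency-free
# sketch of the same construction, assuming a Box-like continuous action
# space with per-dimension bounds (the dimensions are illustrative):
import numpy as np

num_envs, act_dim = 8, 2
low, high = -1.0, 1.0
actions = np.stack([np.random.uniform(low, high, size=act_dim)
                    for _ in range(num_envs)], axis=0)
assert actions.shape == (num_envs, act_dim)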
def obtain_samples(self, itr, oracle_policy):
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    agent_only_paths = []
    oracle_only_paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    agent_only_running_paths = [None] * self.vec_env.num_envs
    oracle_only_running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        agent_actions, binary_actions, agent_infos = policy.get_actions(obses)
        oracle_actions, oracle_agent_infos = oracle_policy.get_actions(obses)
        # sigma is a per-env binary gate choosing between the agent action
        # and the oracle action (this code assumes exactly two parallel envs).
        sigma = np.round(binary_actions)
        actions_1 = np.array([
            sigma[0, 0] * agent_actions[0, :] +
            sigma[0, 1] * oracle_actions[0, :]
        ])
        actions_2 = np.array([
            sigma[1, 0] * agent_actions[1, :] +
            sigma[1, 1] * oracle_actions[1, :]
        ])
        actions = np.concatenate((actions_1, actions_2), axis=0)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions, itr)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(dict(
                    observations=self.env_spec.observation_space.flatten_n(
                        running_paths[idx]["observations"]),
                    actions=self.env_spec.action_space.flatten_n(
                        running_paths[idx]["actions"]),
                    rewards=tensor_utils.stack_tensor_list(
                        running_paths[idx]["rewards"]),
                    env_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["env_infos"]),
                    agent_infos=tensor_utils.stack_tensor_dict_list(
                        running_paths[idx]["agent_infos"]),
                ))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        # Additionally record the transitions taken while the agent (rather
        # than the oracle) was in control of at least one env.
        if sigma[0, 0] == 1 or sigma[1, 0] == 1:
            for idx, observation, action, reward, env_info, agent_info, done in zip(
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if agent_only_running_paths[idx] is None:
                    agent_only_running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                agent_only_running_paths[idx]["observations"].append(observation)
                agent_only_running_paths[idx]["actions"].append(action)
                agent_only_running_paths[idx]["rewards"].append(reward)
                agent_only_running_paths[idx]["env_infos"].append(env_info)
                agent_only_running_paths[idx]["agent_infos"].append(agent_info)
                if done:
                    agent_only_paths.append(dict(
                        observations=self.env_spec.observation_space.flatten_n(
                            agent_only_running_paths[idx]["observations"]),
                        actions=self.env_spec.action_space.flatten_n(
                            agent_only_running_paths[idx]["actions"]),
                        rewards=tensor_utils.stack_tensor_list(
                            agent_only_running_paths[idx]["rewards"]),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            agent_only_running_paths[idx]["env_infos"]),
                        agent_infos=tensor_utils.stack_tensor_dict_list(
                            agent_only_running_paths[idx]["agent_infos"]),
                    ))
                    n_samples += len(agent_only_running_paths[idx]["rewards"])
                    agent_only_running_paths[idx] = None
        # (Disabled) symmetric collection of oracle-only paths into
        # oracle_only_paths / oracle_only_running_paths when the oracle is
        # in control.
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths, agent_only_paths
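# A quick check of the sigma-gated action mixing above for two parallel
# envs: a rounded binary gate selects, per env, between the learned agent
# action and the oracle action. The values below are illustrative.
import numpy as np

agent_actions = np.array([[0.1, 0.2], [0.3, 0.4]])
oracle_actions = np.array([[1.0, 1.0], [2.0, 2.0]])
sigma = np.round(np.array([[0.9, 0.1], [0.2, 0.8]]))  # -> [[1, 0], [0, 1]]
mixed = sigma[:, [0]] * agent_actions + sigma[:, [1]] * oracle_actions
assert np.allclose(mixed[0], agent_actions[0])   # env 0: agent in control
assert np.allclose(mixed[1], oracle_actions[1])  # env 1: oracle in control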