# Standard-library / third-party imports used across the excerpts below.
# Project-level helpers referenced here (get_environment, base_sampler,
# eval_sampler, tensor_utils, Logger, FRAME_SIZE, LOG_DIR) are assumed to be
# provided by the surrounding package and are not redefined in this listing.
import os
import time as timer

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm


def sample_paths_one_core(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None,
                          mode='sample'):
    """
    params:
    N            : number of sample points
    policy       : policy to be used to sample the data
    T            : maximum length of trajectory
    env          : env object to sample from
    env_name     : name of env to be sampled from
                   (one of env or env_name must be specified)
    pegasus_seed : seed for environment (numpy seed must be set externally)
    """
    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None:
        env = get_environment(env_name)
    # if pegasus_seed is not None: env.env._seed(pegasus_seed)
    T = min(T, env.horizon)

    start_time = timer.time()
    # print("####### Gathering Samples #######")

    sampled_so_far = 0
    paths = []
    seed = pegasus_seed if pegasus_seed is not None else 0
    while sampled_so_far < N:
        if mode == 'sample':
            # do 1 rollout
            this_path = base_sampler.do_rollout(1, policy, T, env, env_name, seed)
        elif mode == 'evaluation':
            this_path = eval_sampler.do_evaluation_rollout(1, policy, env, env_name, seed)
        else:
            # print("Mode has to be either 'sample' for training time or 'evaluation' for test time performance")
            break
        paths.append(this_path[0])
        seed += 1
        sampled_so_far += len(this_path[0]["rewards"])

    # print("======= Samples Gathered ======= | >>>> Time taken = %f " % (timer.time() - start_time))
    # print("................................. | >>>> # samples = %i # trajectories = %i " % (sampled_so_far, len(paths)))
    return paths
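# A small, self-contained helper (not part of the original source) showing how
# the list of path dicts returned by sample_paths_one_core / do_evaluation_rollout
# can be summarised. Only the "rewards" key, which the samplers above always
# populate, is assumed; the synthetic paths below stand in for real sampler output.
def summarize_paths(paths):
    """Return (num_trajectories, total_samples, mean_return) for a list of paths."""
    returns = [np.sum(p["rewards"]) for p in paths]
    total_samples = sum(len(p["rewards"]) for p in paths)
    return len(paths), total_samples, float(np.mean(returns))


if __name__ == "__main__":
    fake_paths = [dict(rewards=np.random.randn(50)) for _ in range(4)]
    print(summarize_paths(fake_paths))  # e.g. (4, 200, ...)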
def do_evaluation_rollout(N,
                          policy,
                          T=1e6,
                          env=None,
                          env_name=None,
                          pegasus_seed=None):
    """
    params:
    N            : number of trajectories
    policy       : policy to be used to sample the data
    T            : maximum length of trajectory
    env          : env object to sample from
    env_name     : name of env to be sampled from
                   (one of env or env_name must be specified)
    pegasus_seed : seed for environment (numpy seed must be set externally)
    """
    if env_name is None and env is None:
        print("No environment specified! Error will be raised")
    if env is None:
        env = get_environment(env_name)
    if pegasus_seed is not None:
        # Older gym versions expose _seed, newer ones expose seed
        try:
            env.env._seed(pegasus_seed)
        except AttributeError:
            env.env.seed(pegasus_seed)
    T = min(T, env.horizon)

    # print("####### Worker started #######")

    paths = []
    for ep in range(N):
        # Set pegasus seed if asked
        if pegasus_seed is not None:
            seed = pegasus_seed + ep
            try:
                env.env._seed(seed)
            except AttributeError:
                env.env.seed(seed)
            np.random.seed(seed)
        else:
            np.random.seed()

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []

        o = env.reset()
        done = False
        t = 0

        while t < T and not done:
            _, agent_info = policy.get_action(o)
            a = agent_info['evaluation']  # deterministic evaluation action
            next_o, r, done, env_info = env.step(a)
            # observations.append(o.ravel())
            observations.append(o)
            actions.append(a)
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done,
        )
        paths.append(path)

    # print("====== Worker finished ======")
    return paths
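# The rollout above stores per-step agent_infos / env_infos as lists of dicts
# and stacks them with tensor_utils.stack_tensor_dict_list. The sketch below
# is only a simplified stand-in for that call (flat dicts; the project's
# tensor_utils presumably also handles nested dicts), shown to clarify the
# resulting path layout: one array per key, stacked along a new time axis.
def stack_dict_list(dict_list):
    """Stack a list of {key: array-like} dicts into {key: stacked array}."""
    keys = dict_list[0].keys()
    return {k: np.array([d[k] for d in dict_list]) for k in keys}


if __name__ == "__main__":
    infos = [dict(evaluation=np.zeros(3), step=t) for t in range(5)]
    stacked = stack_dict_list(infos)
    print(stacked["evaluation"].shape)  # (5, 3)
    print(stacked["step"])              # [0 1 2 3 4]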
def trajectory_generator(self, beta, dagger_ep):
    if self.env_name is None and self.env is None:
        print("No environment specified! Error will be raised")
    if self.env is None:
        self.env = get_environment(self.env_name)
    T = self.env.horizon

    paths = []
    print('Generating trajectories')
    for ep in tqdm(range(self.num_traj_gen)):
        # Seed env and numpy per trajectory so rollouts are reproducible
        # across DAgger epochs
        if self.seed is not None:
            seed = self.seed + ep + dagger_ep * self.num_traj_gen
            self.env.env.env._seed(seed)
            np.random.seed(seed)
        else:
            np.random.seed()

        observations = []
        actions = []
        rewards = []
        agent_infos = []
        env_infos = []
        path_image_pixels = []
        all_robot_info = []

        o = self.env.reset()
        robot_info = None
        if self.has_robot_info:
            # Reset again to also obtain the env_info carrying robot_info
            o, env_info = self.env.reset()
            robot_info = env_info['robot_info']
        done = False
        t = 0

        while t < T and not done:
            # Draw the DAgger mixing coin before acting
            mix = np.random.random()
            image_pix = self.env.get_pixels(frame_size=FRAME_SIZE,
                                            camera_name=self.camera_name,
                                            device_id=self.device_id)
            a_expert, agent_info_expert = self.expert_policy.get_action(o)

            # Build a 3-frame stack (t-2, t-1, t); missing earlier frames are
            # padded with the current frame at the start of the trajectory
            img = image_pix
            prev_img = image_pix
            prev_prev_img = image_pix
            if t > 0:
                prev_img = path_image_pixels[t - 1]
            if t > 1:
                prev_prev_img = path_image_pixels[t - 2]
            prev_prev_img = np.expand_dims(prev_prev_img, axis=0)
            prev_img = np.expand_dims(prev_img, axis=0)
            img = np.expand_dims(img, axis=0)
            o_img = np.concatenate((prev_prev_img, prev_img, img), axis=0)

            a_viz, agent_info_viz = self.viz_policy.get_action(
                o_img,
                use_seq=self.use_seq,
                use_cuda=self.use_cuda,
                robot_info=robot_info)

            # Execute the expert action with probability beta, otherwise the
            # visual policy's action
            if mix <= beta:
                a = agent_info_expert['evaluation']
                agent_info = agent_info_expert
            else:
                a = a_viz
                agent_info = agent_info_viz

            next_o, r, done, env_info = self.env.step(a)

            observations.append(o)
            # DAgger: always label the visited state with the expert's action
            actions.append(agent_info_expert['evaluation'])
            rewards.append(r)
            agent_infos.append(agent_info)
            env_infos.append(env_info)
            path_image_pixels.append(image_pix)
            if self.has_robot_info:
                all_robot_info.append(robot_info)
                robot_info = env_info['robot_info']
            o = next_o
            t += 1

        path = dict(
            observations=np.array(observations),
            actions=np.array(actions),
            rewards=np.array(rewards),
            agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
            env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
            terminated=done,
            image_pixels=np.array(path_image_pixels))
        if self.has_robot_info:
            path['robot_info'] = np.array(all_robot_info)
        paths.append(path)

    return paths
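# trajectory_generator above feeds the visual policy a 3-frame stack
# (previous-previous, previous, current frame) built with np.expand_dims and
# np.concatenate. This self-contained sketch (synthetic frames, placeholder
# frame size) only illustrates the shape that viz_policy.get_action receives
# as o_img; it is not part of the original source.
FRAME_SIZE_DEMO = (64, 64)  # placeholder; the real FRAME_SIZE comes from the module


def stack_three_frames(frames, t):
    """Replicate the 3-frame stacking logic for time step t."""
    img = frames[t]
    prev_img = frames[t - 1] if t > 0 else img
    prev_prev_img = frames[t - 2] if t > 1 else img
    return np.concatenate([np.expand_dims(f, axis=0)
                           for f in (prev_prev_img, prev_img, img)], axis=0)


if __name__ == "__main__":
    frames = [np.zeros(FRAME_SIZE_DEMO + (3,)) for _ in range(5)]
    print(stack_three_frames(frames, t=0).shape)  # (3, 64, 64, 3)
    print(stack_three_frames(frames, t=4).shape)  # (3, 64, 64, 3)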
def __init__(self,
             dagger_epochs,
             expert_policy,
             viz_policy,
             old_data_loader: DataLoader,
             val_data_loader: DataLoader,
             log_dir,
             pol_dir_name,
             has_robot_info=False,
             beta_start=1.0,
             beta_decay=0.9,
             beta_cutoff=0.0,
             optimizer=None,
             camera_name=None,
             lr=3e-4,
             log_step=10,
             bins=0,
             use_img=True,
             use_seq=True,
             trainer_epochs=5,
             num_traj_gen=20,
             env_name=None,
             env=None,
             save_epoch=1,
             eval_num_traj=25,
             seed=500,
             sliding_window=0,
             device_id=None,
             use_cuda=False):
    self.beta = beta_start
    self.dagger_epochs = dagger_epochs
    self.expert_policy = expert_policy
    self.viz_policy = viz_policy
    self.old_data_loader = old_data_loader
    self.beta_decay = beta_decay
    self.camera_name = camera_name
    self.has_robot_info = has_robot_info
    self.beta_cutoff = beta_cutoff
    self.log_step = log_step
    self.bins = bins
    self.pol_dir_name = pol_dir_name
    self.use_img = use_img
    self.use_seq = use_seq
    self.trainer_epochs = trainer_epochs
    self.val_data_loader = val_data_loader
    self.num_traj_gen = num_traj_gen
    self.eval_num_traj = eval_num_traj
    self.env = env
    self.env_name = env_name
    self.save_epoch = save_epoch
    self.sliding_window = sliding_window
    self.device_id = device_id
    self.use_cuda = use_cuda

    # filewriters
    self.log_tf_train = Logger(os.path.join(LOG_DIR, log_dir))
    self.log_tf_val = Logger(os.path.join(LOG_DIR, log_dir, 'validation'))
    self.log_expert = Logger(os.path.join(LOG_DIR, log_dir, 'expert'))
    self.log_viz = Logger(os.path.join(LOG_DIR, log_dir, 'viz'))

    self.loss_fn = torch.nn.CrossEntropyLoss()
    self.optimizer = torch.optim.Adam(
        self.viz_policy.trainable_params,
        lr=lr) if optimizer is None else optimizer
    self.seed = seed

    if self.env_name is None and self.env is None:
        print("No environment specified! Error will be raised")
    if self.env is None:
        self.env = get_environment(self.env_name)

    # Evaluate the expert once up front so learner performance can be
    # compared against this baseline during training
    self.expert_reward, _, _ = self.env.evaluate_policy(
        self.expert_policy,
        num_episodes=self.eval_num_traj,
        mean_action=True,
        seed=self.seed,
        device_id=self.device_id)
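# The constructor above stores beta_start, beta_decay and beta_cutoff, but the
# training loop that consumes them is not part of this excerpt. The sketch
# below only illustrates the schedule those parameters suggest under one
# common DAgger convention (multiplicative decay per epoch, clamped to zero at
# the cutoff); the update rule actually used by the project may differ.
def beta_schedule(dagger_epochs, beta_start=1.0, beta_decay=0.9, beta_cutoff=0.0):
    """Yield the expert-mixing coefficient beta for each DAgger epoch."""
    beta = beta_start
    for _ in range(dagger_epochs):
        yield beta
        beta = beta * beta_decay
        if beta <= beta_cutoff:
            beta = 0.0


if __name__ == "__main__":
    # With beta_start=1.0 and beta_decay=0.9: 1.0, 0.9, 0.81, ...
    print([round(b, 3) for b in beta_schedule(5)])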