def collect_new_paths(self, max_path_length, num_steps, discard_incomplete_paths):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
        )
        path_len = len(path['actions'])
        if (path_len != max_path_length
                and not path['terminals'][-1]
                and discard_incomplete_paths):
            break
        path_expert_len = 0
        # if the path did not reach the goal, add expert demonstration
        if not path['rewards'][-1] > 0:
            path_expert = rollout(
                self._env,
                self._expert_policy,
                max_path_length=max_path_length_this_loop)
            # if expert demonstration successfully reached goal, add it to buffer
            if path_expert['rewards'][-1] > 0:
                paths.append(path_expert)
                path_expert_len = len(path_expert['actions'])
            else:
                print('No expert solution found.')
                # import pickle as pkl
                # import os
                # filename_fails = '/sequoia/data1/rstrudel/code/nmp/fails.pkl'
                # if os.path.exists(filename_fails):
                #     with open(filename_fails, 'rb') as fpkl:
                #         file_fails = pkl.load(fpkl)
                # else:
                #     file_fails = []
                # file_fails.append((self._env.idx_env, self._env.start, self._env.goal))
                # with open(filename_fails, 'wb') as fpkl:
                #     pkl.dump(file_fails, fpkl)
        num_steps_collected += path_len + path_expert_len
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
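# Illustrative sketch (an assumption, not from the source): every collector and
# simulate_policy variant in this file indexes into the dict returned by
# rollout(). The key names below are the ones the snippets actually use; the
# shapes are assumptions based on typical rlkit-style rollouts, and
# describe_path() is a hypothetical helper added only for illustration.
import numpy as np

def describe_path(path):
    """Print the per-key shapes of a single rollout path."""
    for key in ('observations', 'actions', 'rewards',
                'next_observations', 'terminals'):
        if key in path:
            print(key, np.asarray(path[key]).shape)
    # env_infos, where present, is a list with one info dict per timestep
    print('env_infos', len(path.get('env_infos', [])))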
def simulate_policy(args):
    data = torch.load(args.file)
    fig_dir = os.path.dirname(args.file)
    print(fig_dir)
    policy = data['evaluation/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    paths = []
    while len(paths) < args.num_path:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        paths.append(path)
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
    if args.visualize:
        plot_problem_paths(env, paths, fig_dir, show_fig=False)
def simulate_policy(args):
    data = torch.load(str(args.file))
    # data = joblib.load(str(args.file))
    policy = data['evaluation/policy']
    env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if args.collect:
        data = []
    for trial in tqdm(range(100)):
        path = rollout(
            env,
            policy,
            max_path_length=args.H + 1,
            render=not args.collect,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        if args.collect:
            data.append([path['actions'], path['next_observations']])
    if args.collect:
        import pickle
        with open("data/expert.pkl", mode='wb') as f:
            pickle.dump(data, f)
def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
):
    actions = []
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
        )
        path_len = len(path['actions'])
        if (path_len != max_path_length
                and not path['terminals'][-1]
                and discard_incomplete_paths):
            break
        actions.extend(path['actions'])
        num_steps_collected += path_len
        paths.append(path)
    self._actions = actions
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
        )
        path_len = len(path['actions'])
        # calculate advantages and add column to path
        path = self.add_advantages(path, path_len, self.calculate_advantages)
        if (path_len != max_path_length
                and not path['terminals'][-1]
                and discard_incomplete_paths):
            break
        num_steps_collected += path_len
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def collect_new_paths(self, max_path_length, num_steps, discard_incomplete_paths):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(self._env,
                       self._policy,
                       max_path_length=max_path_length_this_loop,
                       render=self._render and len(paths) == 0)
        path_len = len(path['actions'])
        # : we don't want to skip incomplete paths, and in fact don't have a
        # meaningful max path length
        # if (
        #         path_len != max_path_length
        #         and not path['terminals'][-1]
        #         and discard_incomplete_paths
        # ):
        #     break
        num_steps_collected += path_len
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def simulate_policy(args):
    # data = torch.load(args.file)
    variant, data = doc.load_rklit_file(args.session_name)
    if args.mode == 'eval':
        policy = data['evaluation/policy']
    elif args.mode == 'expl':
        policy = data['exploration/policy']
    else:
        policy = None
    # env = data['evaluation/env']
    environment = stuff.NormalizedActions(
        env.DeepBuilderEnv(args.session_name, 6, 7, 20, 12))
    environment.env.is_simulation = args.simulation == 1
    print("Policy loaded")
    set_gpu_mode(True)
    policy.cuda()
    while True:
        path = rollout(
            environment,
            policy,
            # max_path_length=args.H,
            render=False,
        )
        if hasattr(environment, "log_diagnostics"):
            environment.log_diagnostics([path])
        logger.dump_tabular()
def validate(policy, envs, horizon):
    """
    Collect list of stats for each validation env as dict of following format:
    'pickup_wood': [0, 15, 20] means you picked up a wood object at
    timesteps 0, 15, and 20.
    """
    stats = [{} for _ in range(len(envs))]
    for env_idx, env in enumerate(envs):
        path = rollout(env, policy, horizon)
        for typ in env.object_to_idx.keys():
            if typ in TYPE_TO_CLASS_ABS and TYPE_TO_CLASS_ABS[typ]().can_mine(env):
                key = 'pickup_%s' % typ
                last_val = 0
                pickup_idxs = []
                for t, env_info in enumerate(path['env_infos']):
                    count = env_info[key] - last_val
                    pickup_idxs.extend([t for _ in range(count)])
                    last_val = env_info[key]
                stats[env_idx][key] = pickup_idxs
        for typ in env.interactions.values():
            key = 'made_%s' % typ
            last_val = 0
            made_idxs = []
            for t, env_info in enumerate(path['env_infos']):
                count = env_info[key] - last_val
                made_idxs.extend([t for _ in range(count)])
                last_val = env_info[key]
            stats[env_idx][key] = made_idxs
    return stats
def experiment(My_args):
    args = getArgs()
    expl_env = environment(args)
    # expl_env.render()
    My_args.file = '/home/yujr/rlkit/data/Test/Test_2020_06_08_21_52_33_0000--s-79802/params.pkl'
    data = torch.load(My_args.file)
    print("data loaded", data['evaluation/policy'])
    policy = data['evaluation/policy']
    print("Policy loaded")
    if My_args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            expl_env,
            policy,
            max_path_length=My_args.H,
            # render=True,
        )
        print('path')
        # if hasattr(env, "log_diagnostics"):
        #     env.log_diagnostics([path])
        logger.dump_tabular()
def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
        )  # data for a single trajectory
        path_len = len(path['actions'])
        if (path_len != max_path_length  # did this trajectory collect enough steps?
                and not path['terminals'][-1]  # is the final state terminal == 1?
                and discard_incomplete_paths  # discard this trajectory? it ended for an
                                              # unclear reason, possibly an environment bug
                ):
            break
        num_steps_collected += path_len
        paths.append(path)  # store the collected trajectories
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    # the path queue keeps at most _max_num_epoch_paths_saved trajectories
    self._epoch_paths.extend(paths)
    # return the trajectories generated this round
    return paths
def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
        policy_fn=None,
):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
        )
        path_len = len(path['actions'])
        if (path_len != max_path_length
                and not path['terminals'][-1]
                and discard_incomplete_paths):
            break
        num_steps_collected += path_len
        # When sparse-reward mode is enabled, perturb the rewards with Gaussian noise
        if self._sparse_reward:
            random_noise = np.random.normal(size=path['rewards'].shape)
            path['rewards'] = path['rewards'] + 1.0 * random_noise
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def simulate_policy(args):
    data = torch.load(args.file)
    policy = data['evaluation/policy']
    if args.gpu:
        ptu.set_gpu_mode(True)
        policy.cuda()
        print("set gpu")
    print(ptu.device)
    config_file = get_config_file(args.config_file)
    env = NormalizedBoxEnv(
        load_env(args, config_file, args.env_mode, ptu.device.index))
    print("Policy loaded")
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def simulate_policy(args): data = torch.load(args.file) policy = data['evaluation/policy'] env = data['evaluation/env'] print("Policy loaded") if args.gpu: set_gpu_mode(True) policy.cuda() paths = [] while True: path = rollout( env, policy, max_path_length=args.H, render=True, ) paths.append(path) if hasattr(env, "log_diagnostics"): env.log_diagnostics(paths) if hasattr(env, "get_diagnostics"): for k, v in env.get_diagnostics(paths).items(): logger.record_tabular(k, v) else: logger.record_dict( eval_util.get_generic_path_information(paths), prefix="evaluation/", ) logger.dump_tabular()
def get_validation_returns(self, snapshot):
    policy = snapshot['evaluation/policy']
    policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(self.eval_env.action_space, 0.1), policy)
    validation_envs = pickle.load(open(self.validation_envs_pkl, 'rb'))
    returns = np.zeros(len(validation_envs['envs']))
    for env_idx, env in enumerate(validation_envs['envs']):
        path = rollout(env, policy, self.validation_rollout_length)
        returns[env_idx] = path['rewards'].sum()
    return {'returns': returns.mean()}
def simulate_policy(args):
    data = pickle.load(open(args.file, "rb"))
    policy_key = args.policy_type + '/policy'
    if policy_key in data:
        policy = data[policy_key]
    else:
        raise Exception("No policy found in loaded dict. Keys: {}".format(
            data.keys()))
    env_key = args.env_type + '/env'
    if env_key in data:
        env = data[env_key]
    else:
        raise Exception("No environment found in loaded dict. Keys: {}".format(
            data.keys()))
    if isinstance(env, RemoteRolloutEnv):
        env = env._wrapped_env
    print("Policy loaded")
    if args.enable_render:
        # some environments need to be reconfigured for visualization
        env.enable_render()
    if args.gpu:
        ptu.set_gpu_mode(True)
        if hasattr(policy, "to"):
            policy.to(ptu.device)
        if hasattr(env, "vae"):
            env.vae.to(ptu.device)
    if args.deterministic:
        policy = MakeDeterministic(policy)
    if args.pause:
        import ipdb
        ipdb.set_trace()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    paths = []
    while True:
        paths.append(
            rollout(
                env,
                policy,
                max_path_length=args.H,
                render=not args.hide,
            ))
        if args.log_diagnostics:
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics(paths, logger)
            for k, v in eval_util.get_generic_path_information(paths).items():
                logger.record_tabular(k, v)
            logger.dump_tabular()
def simulate_policy(args):
    filename = args.file
    data = torch.load(filename)
    filename_token = filename.split('/')
    save_path = os.path.join(
        os.path.join(*filename_token[:-1]),
        'offline_buffer_' + filename_token[-1].split('.')[0] + '.hdf5')
    print(save_path)
    print(data)
    '''
    I don't know why, but they did not save the policy for evaluation.
    Instead, I used trainer/policy.
    '''
    policy = data['trainer/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    n = 0
    traj_list = []
    while n < args.buffer_size:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=False,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
        n = n + len(path['rewards'])
        print('Saving %d sequences' % n)
        traj_list.append(path)
    # Visualize one trajectory
    if args.visualize:
        states = path['observations']
        states = np.concatenate(
            [states, path['next_observations'][-1:, :]], axis=0)
        gr = 0.1  # goal radius, for visualization purposes
        g = np.array([1.0, 1.0])
        plt.figure(figsize=(8, 8))
        axes = plt.axes()
        axes.set(aspect='equal')
        plt.axis([-0.25, 1.25, -0.25, 1.25])
        circle = plt.Circle((g[0], g[1]), radius=gr)
        axes.add_artist(circle)
        plt.plot(states[:-1, 0], states[:-1, 1], '-o')
        plt.plot(states[-1, 0], states[-1, 1], '-x', markersize=20)
        plt.show()
def simulate_policy(args): data = torch.load(args.file) policy = data['evaluation/policy'] env = data['evaluation/env'] print("Policy loaded") if args.gpu: set_gpu_mode(True) policy.cuda() while True: path = rollout( env, policy, max_path_length=args.H, render=True, ) if hasattr(env, "log_diagnostics"): env.log_diagnostics([path]) logger.dump_tabular()
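# Illustrative sketch (an assumption, not from the source): the simulate_policy
# variants above read args.file, args.H and args.gpu. A minimal argument parser
# compatible with the simplest of them might look like this; the default value
# for --H and the help strings are hypothetical.
import argparse

def build_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str,
                        help='path to the saved snapshot (e.g. params.pkl)')
    parser.add_argument('--H', type=int, default=300,
                        help='max path length per rollout')
    parser.add_argument('--gpu', action='store_true',
                        help='run the policy on GPU')
    return parser

# Example usage:
# simulate_policy(build_parser().parse_args())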
def collect_new_paths(self, max_path_length, num_steps):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        path = rollout(
            self._env,
            self._policy,
            max_path_length=min(  # Do not go over num_steps
                max_path_length,
                num_steps - num_steps_collected,
            ),
        )
        num_steps_collected += len(path['actions'])
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def collect_new_paths(
        self,
        max_path_length,
        num_eps,
):
    paths = []
    ep_collected = 0
    fails = 0
    num_steps_collected = 0
    while ep_collected < num_eps:
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length,
        )
        path_len = len(path['actions'])
        num_steps_collected += path_len
        ep_collected += 1
        paths.append(path)
    self._num_steps_total += num_steps_collected
    self._num_paths_total += len(paths)
    self._epoch_paths.extend(paths)
    last_rewards = [path["rewards"][-1] for path in paths]
    returns = [sum(path["rewards"]) for path in paths]
    lengths = [len(path["actions"]) for path in paths]
    terminals = [path['terminals'][-1] for path in paths]
    # passed criterion
    solved = False
    if self.pass_criterion(returns, lengths, terminals, last_rewards,
                           max_path_length):
        print("Solved")
        solved = True
    # # reach the end
    # def criterion(path):
    #     return path['terminals'] and len(path['rewards']) <= max_path_length and \
    #         path['reward'][-1] != self.terminal_reward
    # finished = sum([1 if criterion(path) else 0 for path in paths])
    # if finished == len(paths):
    #     solved = True
    return paths, solved
def collect_new_paths(
        self,
        max_path_length,
        num_paths,
):
    paths = []
    num_steps_collected = 0
    for _ in range(num_paths):
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length,
        )
        path_len = len(path['actions'])
        num_steps_collected += path_len
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def collect_new_paths(
        self,
        max_path_length,
        num_steps,
        discard_incomplete_paths,
):
    paths = []
    num_steps_collected = 0
    while num_steps_collected < num_steps:
        max_path_length_this_loop = min(  # Do not go over num_steps
            max_path_length,
            num_steps - num_steps_collected,
        )
        path = rollout(
            self._env,
            self._policy,
            max_path_length=max_path_length_this_loop,
        )
        path_len = len(path['actions'])
        if (
                path_len != max_path_length
                and not path['terminals'][-1]
                and discard_incomplete_paths
        ):
            break
        num_steps_collected += path_len
        # When sparse-reward mode is enabled, perturb the rewards with Gaussian noise
        if self._sparse_reward:
            random_noise = np.random.normal(size=path['rewards'].shape)
            path['rewards'] = path['rewards'] + 1.0 * random_noise
            # bins = np.array([-10, -0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
            # temp_rewards = np.cast(path['rewards']/2.0, )
            # temp_rewards = (path['rewards'] > 1.0)
            # path['rewards'] = temp_rewards.astype(np.float32)
        paths.append(path)
    self._num_paths_total += len(paths)
    self._num_steps_total += num_steps_collected
    self._epoch_paths.extend(paths)
    return paths
def simulate_policy(args): data = torch.load(args.file) policy = data['evaluation/policy'] env = data['evaluation/env'] print("Policy loaded") if args.gpu: set_gpu_mode(True) policy.cuda() num_fail = 0 for _ in range(args.ep): path = rollout( env, policy, max_path_length=args.H, render=False, sleep=args.S, ) if np.any(path['rewards'] == -1): num_fail += 1 if args.de: last_obs = np.moveaxis( np.reshape(path['observations'][-1], (3, 33, 33)), 0, -1) last_next_obs = np.moveaxis( np.reshape(path['next_observations'][-1], (3, 33, 33)), 0, -1) last_obs = (last_obs * 33 + 128).astype(np.uint8) last_next_obs = (last_next_obs * 33 + 128).astype(np.uint8) fig = plt.figure(figsize=(10, 10)) fig.add_subplot(2, 1, 1) plt.imshow(last_obs) fig.add_subplot(2, 1, 2) plt.imshow(last_next_obs) plt.show() plt.close() if hasattr(env, "log_diagnostics"): env.log_diagnostics([path]) logger.dump_tabular() print('number of failures:', num_fail)
def pretrain(self):
    if (self.num_paths_for_normalization == 0
            or (self.obs_normalizer is None and self.action_normalizer is None)):
        return
    pretrain_paths = []
    random_policy = RandomPolicy(self.env.action_space)
    while len(pretrain_paths) < self.num_paths_for_normalization:
        path = rollout(self.env, random_policy, self.max_path_length)
        pretrain_paths.append(path)
    ob_mean, ob_std, ac_mean, ac_std = (
        compute_normalization(pretrain_paths))
    if self.obs_normalizer is not None:
        self.obs_normalizer.set_mean(ob_mean)
        self.obs_normalizer.set_std(ob_std)
        self.target_qf.obs_normalizer = self.obs_normalizer
        self.target_policy.obs_normalizer = self.obs_normalizer
    if self.action_normalizer is not None:
        self.action_normalizer.set_mean(ac_mean)
        self.action_normalizer.set_std(ac_std)
        self.target_qf.action_normalizer = self.action_normalizer
        self.target_policy.action_normalizer = self.action_normalizer
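# Illustrative sketch (an assumption, not from the source): compute_normalization()
# is not shown here. Given that pretrain() unpacks its result into observation and
# action means and stds, a plausible implementation aggregates the random-policy
# paths per dimension, roughly as below; compute_normalization_sketch is a
# hypothetical stand-in, not the actual function.
import numpy as np

def compute_normalization_sketch(paths):
    obs = np.concatenate([p['observations'] for p in paths], axis=0)
    acts = np.concatenate([p['actions'] for p in paths], axis=0)
    return obs.mean(axis=0), obs.std(axis=0), acts.mean(axis=0), acts.std(axis=0)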
def collect_new_paths(self,
                      max_path_length,
                      num_steps,
                      discard_incomplete_paths,
                      continuing=False):
    if not continuing:
        # reset held state re: env and obs since we're resetting now
        self.curr_env = self._env
        self.last_obs = None
    path, self.curr_env, self.last_obs = rollout(
        self.curr_env,
        self._policy,
        # : this is not a typo
        max_path_length=num_steps,
        render=self._render,
        return_env_obs=True,
        continuing=continuing,
        obs=self.last_obs)
    path_len = len(path['actions'])
    self._num_paths_total += 1
    self._num_steps_total += path_len
    self._epoch_paths.append(path)
    return path
def simulate_policy(args):
    data = torch.load(args.file)
    print(data)
    # policy = data['evaluation/policy']
    '''
    I don't know why, but they did not save the policy for evaluation.
    Instead, I used trainer/policy.
    '''
    policy = data['trainer/policy']
    env = data['evaluation/env']
    print("Policy loaded")
    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            render=True,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
def validate(self, snapshot):
    """
    Collect list of stats for each validation env as dict of following format:
    'pickup_wood': [0, 15, 20] means you picked up a wood object at
    timesteps 0, 15, and 20.
    """
    policy = snapshot['evaluation/policy']
    if hasattr(policy, 'policy'):
        # if it's reset free, strip out the underlying policy from the
        # exploration strategy
        policy = policy.policy
    policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(self.eval_env.action_space, 0.1), policy)
    validation_envs = pickle.load(open(self.validation_envs_pkl, 'rb'))
    stats = [{} for _ in range(len(validation_envs['envs']))]
    for env_idx, env in enumerate(validation_envs['envs']):
        path = rollout(env, policy, self.validation_rollout_length)
        for typ in env.object_to_idx.keys():
            if typ not in ['empty', 'wall', 'tree']:
                key = 'pickup_%s' % typ
                last_val = 0
                pickup_idxs = []
                for t, env_info in enumerate(path['env_infos']):
                    count = env_info[key] - last_val
                    pickup_idxs.extend([t for _ in range(count)])
                    last_val = env_info[key]
                stats[env_idx][key] = pickup_idxs
        for typ in env.interactions.values():
            key = 'made_%s' % typ
            last_val = 0
            made_idxs = []
            for t, env_info in enumerate(path['env_infos']):
                count = env_info[key] - last_val
                made_idxs.extend([t for _ in range(count)])
                last_val = env_info[key]
            stats[env_idx][key] = made_idxs
    return stats
def offpolicy_inference():
    import time
    from gym import wrappers
    filename = str(uuid.uuid4())
    gpu = True
    env, _, _ = prepare_env(args.env_name, args.visionmodel_path, **env_kwargs)
    snapshot = torch.load(args.load_name)
    policy = snapshot['evaluation/policy']
    if args.env_name.find('doorenv') > -1:
        policy.knob_noisy = args.knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']
    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100
    if evaluation:
        render = False
    else:
        if not args.unity:
            render = True
        else:
            render = False
    start_time = int(time.mktime(time.localtime()))
    if gpu:
        set_gpu_mode(True)
    while True:
        if args.env_name.find('doorenv') > -1:
            path, door_opened, opening_time = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=True,
                render=render,
                evaluate=True,
            )
            print("done first")
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
            if evaluation:
                env, _, _ = prepare_env(args.env_name, args.visionmodel_path,
                                        **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    eval_print(dooropen_counter, epi_counter, start_time, total_time)
        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
        if evaluation:
            print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
            epi_counter += 1
            if args.env_name.find('door') > -1 and epi_counter > test_num:
                eval_print(dooropen_counter, epi_counter, start_time, total_time)
                break
parser.add_argument('--log_dir', type=str, default='PPO')
parser.add_argument('--file', type=str, default='params')
parser.add_argument('--epoch', type=int, default=None)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--neval', type=int, default=100)
args = parser.parse_args()

pre_dir = './Data/' + args.exp_name + args.extra_name
data_path = '{}/{}/seed{}/{}.pkl'.format(pre_dir, args.log_dir, args.seed, args.file)
data = torch.load(data_path, map_location='cpu')
policy = data['trainer/policy']
policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)

import sys
from traffic.make_env import make_env
import json
with open('{}/{}/seed{}/variant.json'.format(pre_dir, args.log_dir, args.seed)) as f:
    variant = json.load(f)
env = make_env(args.exp_name, **variant['env_kwargs'])

returns = []
for i in range(args.neval):
    path = rollout(env, policy, max_path_length=200)
    ret = np.sum(path['rewards'])
    returns.append(ret)
print(np.mean(returns), np.std(returns))
def offpolicy_inference(seed,
                        env_name,
                        det,
                        load_name,
                        evaluation,
                        render,
                        knob_noisy,
                        visionnet_input,
                        env_kwargs,
                        actor_critic=None,
                        verbose=True,
                        pos_control=True,
                        step_skip=4):
    import time
    from gym import wrappers
    print("evaluation started!")
    filename = str(uuid.uuid4())
    gpu = True
    env, _, _ = prepare_env(env_name, **env_kwargs)
    if not actor_critic:
        snapshot = torch.load(load_name)
        policy = snapshot['evaluation/policy']
    else:
        policy = actor_critic
    if env_name.find('doorenv') > -1:
        policy.knob_noisy = knob_noisy
        policy.nn = env._wrapped_env.nn
        policy.visionnet_input = env_kwargs['visionnet_input']
    epi_counter = 1
    dooropen_counter = 0
    total_time = 0
    test_num = 100
    start_time = int(time.mktime(time.localtime()))
    if gpu:
        set_gpu_mode(True)
    while True:
        # print("new env")
        if env_name.find('doorenv') > -1:
            if evaluation:
                path, door_opened, opening_time = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
                # if evaluation:
                #     print("1")
                env, _, _ = prepare_env(env_name, **env_kwargs)
                if door_opened:
                    dooropen_counter += 1
                    total_time += opening_time
                    if verbose:
                        print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(
                            epi_counter))
                        eval_print(dooropen_counter, epi_counter, start_time,
                                   total_time)
            else:
                path = rollout(
                    env,
                    policy,
                    max_path_length=512,
                    render=render,
                    evaluate=evaluation,
                    verbose=True,
                    doorenv=True,
                    pos_control=pos_control,
                    step_skip=step_skip,
                )
                if hasattr(env, "log_diagnostics"):
                    env.log_diagnostics([path])
                logger.dump_tabular()
        else:
            path = rollout(
                env,
                policy,
                max_path_length=512,
                doorenv=False,
                render=render,
            )
            if hasattr(env, "log_diagnostics"):
                env.log_diagnostics([path])
            logger.dump_tabular()
        if evaluation:
            if verbose:
                print("{} ep end >>>>>>>>>>>>>>>>>>>>>>>>".format(epi_counter))
                eval_print(dooropen_counter, epi_counter, start_time, total_time)
            epi_counter += 1
            if env_name.find('door') > -1 and epi_counter > test_num:
                if verbose:
                    print("dooropening counter:", dooropen_counter,
                          " epi counter:", epi_counter)
                    eval_print(dooropen_counter, epi_counter, start_time,
                               total_time)
                break
    opening_rate, opening_timeavg = eval_print(dooropen_counter, epi_counter - 1,
                                               start_time, total_time)
    return opening_rate, opening_timeavg
def get_gifs_heatmaps(exps_dir_name, seeds, save_dir, titles):
    data_dir = join(get_repo_dir(), 'data')
    exps_dir = join(data_dir, exps_dir_name)
    gifs_dir = join(data_dir, 'gifs')
    heat_dir = join(data_dir, 'heatmaps')
    # load variant and get pickled validation envs
    rand_exp_dir = glob(join(exps_dir, '*'))[0]
    with open(join(rand_exp_dir, 'variant.json'), 'r') as f:
        variant = json.load(f)
    task_obj = variant['env_kwargs']['task'].split()[1]
    val_envs_path = variant['algo_kwargs']['algorithm_kwargs'][
        'validation_envs_pkl']
    val_rollout_len = variant['algo_kwargs']['algorithm_kwargs'][
        'validation_rollout_length']
    val_envs = get_val_envs(val_envs_path)
    # load policy
    for seed_idx, seed in enumerate(seeds):
        val_env_idxs = random.sample(list(range(len(val_envs))), 10)
        exp_dir = glob(join(exps_dir, '*%d' % seed))[0]

        """ Get policy """
        pol_file = max(glob(join(exp_dir, 'itr_*.pkl')),
                       key=lambda pol_path: int(basename(pol_path)[4:-4]))
        # to override policy itr number
        # pol_file = join(exp_dir, 'itr_%d.pkl' % 2990)
        print(pol_file)
        with open(pol_file, 'rb') as f:
            policy = pickle.load(f)['evaluation/policy']
        if hasattr(policy, 'policy'):
            # if it's reset free, strip out the underlying policy from the
            # exploration strategy
            policy = policy.policy
        policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(spaces.Discrete(7), 0.1), policy)

        # re-fetch the val envs each time so that envs are fresh
        # val_envs = get_val_envs(val_envs_path)

        # """ Get gifs """
        # stats = [{} for _ in range(len(val_env_idxs))]
        # for meta_idx, env_idx in enumerate(val_env_idxs):
        #     env = val_envs[env_idx]
        #     path = rollout(env, policy, val_rollout_len, render=True, save=True,
        #                    save_dir=join(gifs_dir, exps_dir_name, save_dir,
        #                                  str(seed), str(env_idx)))
        #     env.render(close=True)
        #     for typ in env.object_to_idx.keys():
        #         if typ not in ['empty', 'wall', 'tree']:
        #             key = 'pickup_%s' % typ
        #             last_val = 0
        #             pickup_idxs = []
        #             for t, env_info in enumerate(path['env_infos']):
        #                 count = env_info[key] - last_val
        #                 pickup_idxs.extend([t for _ in range(count)])
        #                 last_val = env_info[key]
        #             stats[meta_idx][key] = pickup_idxs
        #     for typ in env.interactions.values():
        #         key = 'made_%s' % typ
        #         last_val = 0
        #         made_idxs = []
        #         for t, env_info in enumerate(path['env_infos']):
        #             count = env_info[key] - last_val
        #             made_idxs.extend([t for _ in range(count)])
        #             last_val = env_info[key]
        #         stats[meta_idx][key] = made_idxs
        # solved = [val_env_idxs[i] for i, stat in enumerate(stats)
        #           if stat['pickup_%s' % task_obj]]
        # print('seed %d solved %d percent:' % (seed, 100 * len(solved) // len(val_env_idxs)))
        # print(solved)

        # re-fetch the val envs each time so that envs are fresh
        val_envs = get_val_envs(val_envs_path)
        print('refetched envs')

        """ Get heatmaps """
        vcs = []
        for env_idx, env in enumerate(val_envs):
            path = rollout(env, policy, val_rollout_len)
            vcs.append(env.visit_count)
        visit_count_sum = sum(vcs)
        plt.imshow(visit_count_sum)
        plt.title('Validation Tasks State Visitation Count (%s)' % titles[seed_idx])
        plt.axis('off')
        vc_save_path = join(heat_dir, exps_dir_name, save_dir, str(seed))
        os.makedirs(vc_save_path, exist_ok=True)
        plt.savefig(join(vc_save_path, 'map.png'))