def exp4_loop(env, policy, models_path, covers_path, ngoals, max_steps, semi_metric, vis=False, eps_greedy=False):
    recall_at_epoch = []
    hit_time_at_epoch = []
    model_epochs = paper_utils.list_epochs(models_path)
    cover_epochs = paper_utils.list_epochs(covers_path)
    # Evaluate only every 25th epoch, and only epochs for which both a model and a cover exist.
    model_epochs = [epoch for epoch in model_epochs if epoch % 25 == 0]
    cover_epochs = [epoch for epoch in cover_epochs if epoch % 25 == 0]
    n_epochs = np.minimum(len(model_epochs), len(cover_epochs))
    epochs = model_epochs[:n_epochs]
    for epoch_idx in epochs:
        cover_path = f"{covers_path}/epoch_{epoch_idx}.json"
        scrb = MetricDiversifier(k=100, load_model=cover_path, reward_func=None)
        ngoals = np.minimum(ngoals, scrb.k)
        paper_utils.load_model(load_path=f"{models_path}/epoch_{epoch_idx}.model")
        # Sample goals from the cover and measure whether / how fast the policy reaches each one.
        pnts = scrb.draw(ngoals, replace=False)
        reached = np.zeros(len(pnts))
        hit_time = [max_steps for _ in range(len(pnts))]
        for pidx, pnt in enumerate(pnts):
            goal = pnt['ag']
            if reached[pidx]:
                continue
            if semi_metric:
                obs = reset_env(env, scrb=scrb, mode='intrinsic')
            else:
                # Start the episode from a cover point different from the goal point.
                refidx = pidx
                while refidx == pidx:
                    refidx = random.choice([i for i in range(len(pnts))])
                refpnt = pnts[refidx]
                obs = init_from_point(env, refpnt)
            env.env.set_goal(goal=np.asarray(goal))
            for t in range(max_steps):
                if reached[pidx]:
                    break
                if vis:
                    env.render()
                    time.sleep(.01)
                action, _, state, _ = policy.step(obs)
                if eps_greedy and t % 10 == 0:
                    action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if info['is_success']:
                    reached[pidx] = 1
                    hit_time[pidx] = t
        recall_at_epoch.append(reached.mean())
        hit_time_at_epoch.append(np.mean(hit_time))
    return epochs, recall_at_epoch, hit_time_at_epoch

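# Usage sketch for exp4_loop (illustrative; the checkpoint directories and
# evaluation settings below are placeholders, not values from the original runs):
#
#   policy, _ = paper_utils.load_policy(env_id)
#   epochs, recall, hit = exp4_loop(env, policy,
#                                   models_path="logs/run0/models",
#                                   covers_path="logs/run0/covers",
#                                   ngoals=50, max_steps=100,
#                                   semi_metric=True)
#   # recall[i] is the fraction of sampled cover goals reached at epochs[i];
#   # hit[i] is the mean number of steps to success (max_steps if never reached).
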
def exp1_overlayed_figure(env, scrb: MetricDiversifier, save_directory, message):
    reset_env(env, scrb, mode='intrinsic')
    rooms_layer = env.env._get_rooms_image()
    agent_layer = env.env._get_agent_image()
    for pidx in scrb.used_slots():
        # obs = reset_env(env, scrb, mode='intrinsic')
        init_from_point(env, scrb.buffer[pidx])
        agent_p = env.env._get_agent_image()
        agent_layer += agent_p
    agent_layer = (255 * (agent_layer / agent_layer.max())).astype(np.int32)
    frame = np.concatenate([agent_layer, 0 * rooms_layer, rooms_layer], axis=2)
    for i in range(frame.shape[0]):
        for j in range(frame.shape[1]):
            if frame[i, j, :].sum() == 0:
                frame[i, j] = 255
    fig, ax = plt.subplots(1, 1)
    plt.imshow(frame)
    if not os.path.exists(save_directory):
        os.makedirs(save_directory)
    fig_name = f"{save_directory}/{message}_frame.png"
    ax.set_xticks([], [])
    ax.set_yticks([], [])
    plt.tight_layout()
    plt.savefig(fig_name)
    print(f"saved figure: {fig_name}")

def play_policy(env, env_id, T=20, load_path=None, cover_path=None, semi_metric=False, eps_greedy=False, **kwargs):
    policy, reward_fun = paper_utils.load_policy(env_id, **kwargs)
    paper_utils.load_model(load_path=load_path)
    scrb = MetricDiversifier(k=100, load_model=cover_path, reward_func=None)
    obs = reset_env(env, scrb, mode='intrinsic')
    i = 0
    # Roll out the policy indefinitely, resetting on success or after T steps.
    while True:
        i += 1
        env.render()
        time.sleep(.01)
        action, _, state, _ = policy.step(obs)
        if eps_greedy and i % 10 == 0:
            action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        success = info['is_success']
        timeout = i % T == 0
        done = success or timeout
        if done:
            # input(f"success: {success}, invalid: {invalid}, timeout: {timeout}")
            if scrb is None or semi_metric:
                reset_env(env, scrb, mode='intrinsic')
            else:
                reset_env(env, scrb, mode='extrinsic')
            obs = set_goal(env, scrb)
            i = 0
    env.close()

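# Usage sketch for play_policy (illustrative; the env factory name and file
# paths are placeholders and assume a trained model plus a saved cover from a
# previous run):
#
#   env = make_env(env_id)  # hypothetical env construction
#   play_policy(env, env_id, T=50,
#               load_path="logs/run0/models/epoch_100.model",
#               cover_path="logs/run0/covers/epoch_100.json",
#               semi_metric=False, eps_greedy=True)
#
# The loop renders until interrupted, resetting the environment and re-sampling
# a goal from the cover whenever the agent succeeds or T steps elapse.
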
def make_thunk(k):
    # Note: this helper closes over names expected to exist in the enclosing
    # scope (vis, vis_coords, load_path, log_path, random_cover, load_prob,
    # phase_length, dilute_at_goal); it is not self-contained at module level.
    return lambda: MetricDiversifier(k=k,
                                     vis=vis,
                                     vis_coords=vis_coords,
                                     load_model=load_path,
                                     save_path=f"{log_path}/{k}/mca_cover",
                                     random_cover=random_cover,
                                     load_p=load_prob,
                                     phase_length=phase_length,
                                     dilute_at_goal=dilute_at_goal)

def scan_cover(env, action_repetition=1, cover_path=None, **kwargs):
    scrb = MetricDiversifier(k=100, load_model=cover_path, reward_func=None)
    obs = reset_env(env, scrb, mode='intrinsic')
    for i in range(100000):
        env.render()
        time.sleep(.1)
        # Resample the action only every `action_repetition` frames.
        if i % action_repetition == 0:
            a = env.action_space.sample()
        obs, reward, done, info = env.step(a)
        if i % 1 == 0:
            ob = reset_env(env, scrb, mode='extrinsic')
            # print(np.linalg.norm(ob["qvel"]))
            time.sleep(.5)
    env.close()

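# Usage sketch for scan_cover (illustrative; the cover path is a placeholder):
#
#   scan_cover(env, action_repetition=5,
#              cover_path="logs/run0/covers/epoch_100.json")
#
# This renders the environment while repeatedly re-initializing it from points
# of the loaded cover, as a quick visual check of the cover.
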
def exp3_loop(env, policy, models_path, covers_path, ngoals, max_steps, semi_metric, vis=False, eps_greedy=False):
    variance_at_epoch = []
    min_dists = []
    hit_times = []
    epochs = paper_utils.list_epochs(covers_path)
    epochs.sort()
    epochs = [epoch for epoch in epochs if epoch % 25 == 0]
    # epochs = epochs[:2]
    for epoch_idx in epochs:
        # Note: the final-epoch model is evaluated against the cover saved at each epoch.
        model_path = f"{models_path}/epoch_{epochs[-1]}.model"
        paper_utils.load_model(load_path=model_path)
        cover_path = f"{covers_path}/epoch_{epoch_idx}.json"
        scrb = MetricDiversifier(k=100, vis=False, vis_coords=[0, 1], save_path=None, load_model=cover_path, reward_func=None)
        min_dist = scrb.M.min()
        pnts = scrb.draw(ngoals, replace=False)
        reached = np.zeros(len(pnts))
        hit_time = [max_steps for _ in range(ngoals)]
        reached_list = []
        for pidx, pnt in enumerate(pnts):
            goal = pnt['ag']
            if reached[pidx]:
                continue
            if semi_metric:
                obs = reset_env(env, scrb=scrb, mode='intrinsic')
            else:
                refidx = pidx
                while refidx == pidx:
                    refidx = random.choice([i for i in range(len(pnts))])
                refpnt = pnts[refidx]
                obs = init_from_point(env, refpnt)
            env.env.set_goal(goal=np.asarray(goal))
            for t in range(max_steps):
                if reached[pidx]:
                    break
                if vis:
                    env.render()
                    time.sleep(.01)
                action, _, state, _ = policy.step(obs)
                if eps_greedy and t % 10 == 0:
                    action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if info['is_success']:
                    reached[pidx] = 1
                    reached_list.append(goal)
                    hit_time[pidx] = t
        if len(reached_list) == 0:
            variance_at_epoch.append(0)
        else:
            variance_at_epoch.append(np.asarray(reached_list).std())
        min_dists.append(min_dist)
        hit_times.append(np.mean(hit_time))
    return epochs, variance_at_epoch, min_dists, hit_times

def experiment1(env, env_id, T=100, k=50, load_path=None, save_path=None, semi_metric=False, eps_greedy=False,
                dilute_overlaps=True, ntrials=5, nsteps=10000, random_mode=False, **kwargs):
    policy, reward_fun = paper_utils.load_policy(env_id, **kwargs)
    paper_utils.load_model(load_path=load_path)
    if semi_metric:
        metric_str = "semi_metric"
    else:
        metric_str = "full_metric"
    # Compare a random cover (alpha=0) against the SCRB cover (alpha=0.5) over a sweep of cover sizes k.
    for random_mode in [True, False]:
        if random_mode:
            random_str = 'random'
            alpha = 0
        else:
            random_str = 'scrb'
            alpha = 0.5
        log_path = f"{save_path}/{metric_str}_{random_str}"
        results = dict()
        k_vec = [10, 20, 30, 40, 50]
        # k_vec = [50]
        for k in k_vec:
            results[k] = dict()
            k_radii = []
            for trial_idx in range(ntrials):
                scrb = MetricDiversifier(k=k, vis=False, dilute_overlaps=dilute_overlaps, vis_coords=[0, 1],
                                         save_path=log_path, reward_func=reward_fun, random_mode=random_mode)
                times, radii = exp1_loop(env, scrb, policy, eps_greedy, T, semi_metric, nsteps)
                k_radii.append(radii)
                print(f"k: {k}, trial: {trial_idx}/{ntrials}, nsteps: {nsteps}")
            results[k]["mean"] = np.asarray(k_radii).mean(axis=0)
            results[k]["std"] = np.asarray(k_radii).std(axis=0)
            results[k]["time"] = times
        paper_utils.exp1_to_figure(results, save_directory=log_path, alpha=alpha, message=f"{metric_str}_{random_str}")
        exp1_loop(env, scrb, policy, eps_greedy, T, semi_metric, 50)
        paper_utils.exp1_overlayed_figure(env, scrb, save_directory=log_path, message=f"{metric_str}_{random_str}")

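# Usage sketch for experiment1 (illustrative; the paths, trial count and step
# budget are placeholders). It sweeps cover sizes, contrasts random vs. SCRB
# covers, and writes the resulting figures under save_path:
#
#   experiment1(env, env_id, T=100,
#               load_path="logs/run0/models/epoch_100.model",
#               save_path="logs/run0/exp1",
#               semi_metric=True, ntrials=3, nsteps=5000)
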
def learn(*,
          network,
          env,
          mca_env,
          total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=25,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          log_path=None,
          # save_path=None,
          **kwargs):

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # assert operation mode
    assert kwargs["mode"] in ["basic", "exploration_module", "maximum_span"]

    if kwargs["mode"] == "basic":
        kwargs["mca_state_model"] = None

    def prepare_agent(_env, eval_env, active, exploration='eps_greedy', action_l2=None, scope=None, ss=False, load_path=None):
        # Prepare params.
        _params = copy.deepcopy(config.DEFAULT_PARAMS)
        _kwargs = copy.deepcopy(kwargs)
        _override_params = copy.deepcopy(override_params)
        env_name = _env.spec.id
        _params['env_name'] = env_name
        _params['replay_strategy'] = replay_strategy
        _params['ss'] = ss
        if action_l2 is not None:
            _params['action_l2'] = action_l2
        if not active:
            _params["buffer_size"] = 1
        if env_name in config.DEFAULT_ENV_PARAMS:
            _params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
        _params.update(**_override_params)  # makes it possible to override any parameter
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(_params, f)
        _params = config.prepare_params(_params)
        _params['rollout_batch_size'] = _env.num_envs

        if demo_file is not None:
            _params['bc_loss'] = 1
        _params.update(_kwargs)
        config.log_params(_params, logger=logger)

        if num_cpu == 1:
            logger.warn()
            logger.warn('*** Warning ***')
            logger.warn(
                'You are running HER with just a single MPI worker. This will work, but the ' +
                'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
                'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
                'are looking to reproduce those results, be aware of this. Please also refer to ' +
                'https://github.com/openai/baselines/issues/314 for further details.')
            logger.warn('****************')
            logger.warn()

        dims, coord_dict = config.configure_dims(_params)
        _params['ddpg_params']['scope'] = scope
        policy, reward_fun = config.configure_ddpg(dims=dims, params=_params, active=active, clip_return=clip_return)
        if load_path is not None:
            tf_util.load_variables(load_path)
            print(f"Loaded model: {load_path}")

        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'exploration': exploration
        }

        eval_params = {
            'exploit': True,
            'use_target_net': _params['test_with_polyak'],
            'use_demo_states': False,
            'compute_Q': True,
        }

        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = _params[name]
            eval_params[name] = _params[name]

        eval_env = eval_env or _env

        rollout_worker = RolloutWorker(_env, policy, dims, logger, active, monitor=True, **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger, active, **eval_params)

        return policy, rollout_worker, evaluator, _params, coord_dict, reward_fun

    active = kwargs["mode"] in ["basic", "exploration_module"]
    policy, rollout_worker, evaluator, params, *_ = prepare_agent(env, eval_env, active=active, scope="main")

    n_cycles = params['n_cycles']

    ##############################################################################
    # Maximum Coverage Agent
    mca_active = kwargs["mode"] in ["exploration_module", "maximum_span"]
    mca_load_path = set_default_value(kwargs, 'mca_load_path', None)
    mca_exploration = set_default_value(kwargs, 'mca_exploration', 'eps_greedy')
    mca_action_l2 = set_default_value(kwargs, 'mca_action_l2', 1)
    ss = set_default_value(kwargs, 'ss', False)
    trainable = set_default_value(kwargs, 'trainable', True)
    random_cover = set_default_value(kwargs, 'random_cover', False)
    semi_metric = set_default_value(kwargs, 'semi_metric', False)
    k = set_default_value(kwargs, 'k', 1000)
    feature_w = set_default_value(params, 'feature_w', None)
    invalidate_episodes = set_default_value(kwargs, 'invalidate_episodes', False)
    alpha = set_default_value(kwargs, 'alpha', 0.5)
    nscrb_updates = set_default_value(kwargs, 'nscrb_updates', 1000)

    mca_policy, mca_rw, mca_evaluator, mca_params, coord_dict, reward_fun = prepare_agent(
        mca_env, eval_env, active=mca_active, exploration=mca_exploration,
        action_l2=mca_action_l2, scope="mca", ss=ss, load_path=mca_load_path)

    if semi_metric:
        ncells = rollout_worker.T
    else:
        ncells = 1

    state_model_vec = []
    for cidx in range(ncells):
        state_model_vec.append(
            MetricDiversifier(k=k,
                              reward_func=reward_fun,
                              vis=False,
                              feature_w=feature_w,
                              vis_coords=coord_dict['vis'],
                              load_model=kwargs['load_mca_path'],
                              save_path=f"{log_path}/{cidx}/mca_cover",
                              random_cover=random_cover,
                              load_p=1,
                              ))

    mca = MCA(policy=mca_policy,
              semi_metric=semi_metric,
              rollout_worker=mca_rw,
              evaluator=mca_evaluator,
              state_model=state_model_vec,
              coord_dict=coord_dict,
              active=(alpha > 0))
    ##############################################################################

    if 'n_epochs' not in kwargs:
        n_epochs = total_timesteps // n_cycles // rollout_worker.T // mca_rw.rollout_batch_size
    else:
        n_epochs = int(kwargs['n_epochs'])

    return train(save_path=log_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file,
                 mca=mca,
                 random_cover=random_cover,
                 trainable=trainable,
                 cover_measure_env=kwargs['cover_measure_env'],
                 invalidate_episodes=invalidate_episodes,
                 alpha=alpha,
                 nscrb_updates=nscrb_updates)
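
# Usage sketch for learn (illustrative; the env objects, timestep budget and
# keyword values below are placeholders rather than the settings of the
# original runs; `mode`, `load_mca_path` and `cover_measure_env` are required
# keyword arguments consumed from **kwargs above):
#
#   trained_policy = learn(network='mlp', env=env, mca_env=mca_env,
#                          total_timesteps=int(1e6), seed=0,
#                          log_path="logs/run0",
#                          mode="exploration_module",
#                          load_mca_path=None,
#                          cover_measure_env=None,
#                          semi_metric=True, k=100, alpha=0.5)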