Example #1
def exp4_loop(env,
              policy,
              models_path,
              covers_path,
              ngoals,
              max_steps,
              semi_metric,
              vis=False,
              eps_greedy=False):
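    """Measure goal recall and hit time for each saved checkpoint.

    For every 25th epoch, load the policy weights and the matching cover,
    draw `ngoals` cover points as goals, and roll the policy out for up to
    `max_steps` steps per goal. Returns (epochs, recall_at_epoch,
    hit_time_at_epoch).
    """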
    recall_at_epoch = []
    hit_time_at_epoch = []
    model_epochs = paper_utils.list_epochs(models_path)
    cover_epochs = paper_utils.list_epochs(covers_path)

    model_epochs = [epoch for epoch in model_epochs if epoch % 25 == 0]
    cover_epochs = [epoch for epoch in cover_epochs if epoch % 25 == 0]
    n_epochs = np.minimum(len(model_epochs), len(cover_epochs))

    epochs = model_epochs[:n_epochs]
    for epoch_idx in epochs:

        cover_path = f"{covers_path}/epoch_{epoch_idx}.json"
        scrb = MetricDiversifier(k=100,
                                 load_model=cover_path,
                                 reward_func=None)
        ngoals = np.minimum(ngoals, scrb.k)
        paper_utils.load_model(
            load_path=f"{models_path}/epoch_{epoch_idx}.model")
        pnts = scrb.draw(ngoals, replace=False)
        reached = np.zeros(len(pnts))
        hit_time = [max_steps for _ in range(len(pnts))]
        for pidx, pnt in enumerate(pnts):
            goal = pnt['ag']
            if reached[pidx]:
                continue
            if semi_metric:
                obs = reset_env(env, scrb=scrb, mode='intrinsic')
            else:
                refidx = pidx
                while refidx == pidx:
                    refidx = random.choice([i for i in range(len(pnts))])
                refpnt = pnts[refidx]
                obs = init_from_point(env, refpnt)
            env.env.set_goal(goal=np.asarray(goal))
            for t in range(max_steps):
                if reached[pidx]:
                    break
                if vis:
                    env.render()
                    time.sleep(.01)
                action, _, state, _ = policy.step(obs)
                if eps_greedy and t % 10 == 0:
                    action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if info['is_success']:
                    reached[pidx] = 1
                    hit_time[pidx] = t
        recall_at_epoch.append(reached.mean())
        hit_time_at_epoch.append(np.mean(hit_time))
    return epochs, recall_at_epoch, hit_time_at_epoch
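A minimal sketch of how exp4_loop might be driven, assuming the environment and policy are obtained as in play_policy (Example #3) and that models_path / covers_path contain the epoch_N.model / epoch_N.json checkpoints the loop expects; env_id, the paths, and the goal counts below are placeholders, not project defaults.

# Usage sketch: `env` is assumed to be the project environment (it must expose
# env.env.set_goal) and the paths are placeholders. exp4_loop loads the
# per-epoch model weights itself, so only the policy graph is built here.
policy, _ = paper_utils.load_policy(env_id)
epochs, recall, hit_time = exp4_loop(env, policy,
                                     models_path="logs/models",
                                     covers_path="logs/covers",
                                     ngoals=50,
                                     max_steps=100,
                                     semi_metric=True)
for e, r, h in zip(epochs, recall, hit_time):
    print(f"epoch {e}: recall {r:.2f}, mean hit time {h:.1f}")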
Example #2
def exp1_overlayed_figure(env, scrb: MetricDiversifier, save_directory,
                          message):
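    """Overlay the agent rendered at every used cover slot on top of the rooms
    layout and save the composite frame to `save_directory`."""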

    reset_env(env, scrb, mode='intrinsic')

    rooms_layer = env.env._get_rooms_image()
    agent_layer = env.env._get_agent_image()

    for pidx in scrb.used_slots():
        # obs = reset_env(env, scrb, mode='intrinsic')
        init_from_point(env, scrb.buffer[pidx])
        agent_p = env.env._get_agent_image()
        agent_layer += agent_p

    agent_layer = (255 * (agent_layer / agent_layer.max())).astype(np.int32)
    frame = np.concatenate([agent_layer, 0 * rooms_layer, rooms_layer], axis=2)
    for i in range(frame.shape[0]):
        for j in range(frame.shape[1]):
            if frame[i, j, :].sum() == 0:
                frame[i, j] = 255

    fig, ax = plt.subplots(1, 1)
    plt.imshow(frame)

    if not os.path.exists(save_directory):
        os.makedirs(save_directory)

    fig_name = f"{save_directory}/{message}_frame.png"
    ax.set_xticks([], [])
    ax.set_yticks([], [])
    plt.tight_layout()
    plt.savefig(fig_name)

    print(f"saved figure : {fig_name}")
Example #3
def play_policy(env,
                env_id,
                T=20,
                load_path=None,
                cover_path=None,
                semi_metric=False,
                eps_greedy=False,
                **kwargs):
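    """Visually roll out a trained policy: on every success or timeout, reset
    the environment, set a new goal from the cover, and continue. Runs until
    interrupted."""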
    policy, reward_fun = paper_utils.load_policy(env_id, **kwargs)
    paper_utils.load_model(load_path=load_path)
    scrb = MetricDiversifier(k=100, load_model=cover_path, reward_func=None)
    obs = reset_env(env, scrb, mode='intrinsic')
    i = 0
    while True:
        i += 1
        env.render()
        time.sleep(.01)
        action, _, state, _ = policy.step(obs)
        if eps_greedy and i % 10 == 0:
            action = env.action_space.sample()
        obs, reward, done, info = env.step(action)

        success = info['is_success']
        timeout = i % T == 0
        done = success or timeout
        if done:
            # input(f"success: {success}, invalid: {invalid}, timeout: {timeout}")
            if scrb is None or semi_metric:
                reset_env(env, scrb, mode='intrinsic')
            else:
                reset_env(env, scrb, mode='extrinsic')
            obs = set_goal(env, scrb)
            i = 0
    env.close()
Example #4
def make_thunk(k):
    return lambda: MetricDiversifier(k=k,
                                     vis=vis,
                                     vis_coords=vis_coords,
                                     load_model=load_path,
                                     save_path=f"{log_path}/{k}/mca_cover",
                                     random_cover=random_cover,
                                     load_p=load_prob,
                                     phase_length=phase_length,
                                     dilute_at_goal=dilute_at_goal)
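make_thunk defers construction: each returned lambda captures a cover size k and builds its MetricDiversifier (with a per-k save path) only when called. A minimal usage sketch, assuming the captured variables (vis, vis_coords, load_path, log_path, load_prob, phase_length, dilute_at_goal, random_cover) are in scope as in the enclosing function; the k values are illustrative.

# Build one deferred constructor per cover size; nothing is allocated until the
# thunk is called, and each cover writes to its own {log_path}/{k}/mca_cover dir.
thunks = [make_thunk(k) for k in (10, 50, 100)]
covers = [thunk() for thunk in thunks]  # MetricDiversifier instances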
Example #5
def scan_cover(env, action_repetition=1, cover_path=None, **kwargs):
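    """Visually scan a saved cover: take random actions (resampled every
    `action_repetition` steps) while repeatedly resetting the environment to
    points drawn from the loaded cover."""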
    scrb = MetricDiversifier(k=100, load_model=cover_path, reward_func=None)
    obs = reset_env(env, scrb, mode='intrinsic')
    for i in range(100000):
        env.render()
        time.sleep(.1)
        if i % action_repetition == 0:
            a = env.action_space.sample()
        obs, reward, done, info = env.step(a)
        ob = reset_env(env, scrb, mode='extrinsic')
        # print(np.linalg.norm(ob["qvel"]))
        time.sleep(.5)
    env.close()
Example #6
def exp3_loop(env,
              policy,
              models_path,
              covers_path,
              ngoals,
              max_steps,
              semi_metric,
              vis=False,
              eps_greedy=False):
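    """For each saved cover (every 25th epoch), evaluate the final-epoch policy
    on goals drawn from that cover. Returns the epochs, the spread (std) of the
    goals that were reached, the minimum of the cover's distance matrix
    (scrb.M), and the mean hit times."""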

    variance_at_epoch = []
    min_dists = []
    hit_times = []
    epochs = paper_utils.list_epochs(covers_path)
    epochs.sort()
    epochs = [epoch for epoch in epochs if epoch % 25 == 0]

    # epochs = epochs[:2]
    for epoch_idx in epochs:
        model_path = f"{models_path}/epoch_{epochs[-1]}.model"
        paper_utils.load_model(load_path=model_path)
        cover_path = f"{covers_path}/epoch_{epoch_idx}.json"
        scrb = MetricDiversifier(k=100,
                                 vis=False,
                                 vis_coords=[0, 1],
                                 save_path=None,
                                 load_model=cover_path,
                                 reward_func=None)
        min_dist = scrb.M.min()
        pnts = scrb.draw(ngoals, replace=False)
        reached = np.zeros(len(pnts))
        hit_time = [max_steps for _ in range(len(pnts))]
        reached_list = []
        for pidx, pnt in enumerate(pnts):
            goal = pnt['ag']
            if reached[pidx]:
                continue
            if semi_metric:
                obs = reset_env(env, scrb=scrb, mode='intrinsic')
            else:
                refidx = pidx
                while refidx == pidx:
                    refidx = random.choice([i for i in range(len(pnts))])
                refpnt = pnts[refidx]
                obs = init_from_point(env, refpnt)
            env.env.set_goal(goal=np.asarray(goal))
            for t in range(max_steps):
                if reached[pidx]:
                    break
                if vis:
                    env.render()
                    time.sleep(.01)
                action, _, state, _ = policy.step(obs)
                if eps_greedy and t % 10 == 0:
                    action = env.action_space.sample()
                obs, reward, done, info = env.step(action)
                if info['is_success']:
                    reached[pidx] = 1
                    reached_list.append(goal)
                    hit_time[pidx] = t
        if len(reached_list) == 0:
            variance_at_epoch.append(0)
        else:
            variance_at_epoch.append(np.asarray(reached_list).std())
        min_dists.append(min_dist)
        hit_times.append(np.mean(hit_time))
    return epochs, variance_at_epoch, min_dists, hit_times
Example #7
def experiment1(env,
                env_id,
                T=100,
                k=50,
                load_path=None,
                save_path=None,
                semi_metric=False,
                eps_greedy=False,
                dilute_overlaps=True,
                ntrials=5,
                nsteps=10000,
                random_mode=False,
                **kwargs):
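    """Compare SCRB covers with random covers: for each cover size k and each
    trial, grow a cover via exp1_loop, aggregate the resulting radii, and save
    summary and overlay figures through paper_utils."""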

    policy, reward_fun = paper_utils.load_policy(env_id, **kwargs)
    paper_utils.load_model(load_path=load_path)
    if semi_metric:
        metric_str = "semi_metric"
    else:
        metric_str = "full_metric"

    for random_mode in [True, False]:
        if random_mode:
            random_str = 'random'
            alpha = 0
        else:
            random_str = 'scrb'
            alpha = 0.5

        log_path = f"{save_path}/{metric_str}_{random_str}"

        results = dict()
        k_vec = [10, 20, 30, 40, 50]
        # k_vec = [50]
        for k in k_vec:
            results[k] = dict()
            k_radii = []
            for trial_idx in range(ntrials):
                scrb = MetricDiversifier(k=k,
                                         vis=False,
                                         dilute_overlaps=dilute_overlaps,
                                         vis_coords=[0, 1],
                                         save_path=log_path,
                                         reward_func=reward_fun,
                                         random_mode=random_mode)
                times, radii = exp1_loop(env, scrb, policy, eps_greedy, T,
                                         semi_metric, nsteps)
                k_radii.append(radii)
                print(
                    f"k: {k}, trial: {trial_idx}/{ntrials}, nsteps: {nsteps}")
            results[k]["mean"] = np.asarray(k_radii).mean(axis=0)
            results[k]["std"] = np.asarray(k_radii).std(axis=0)
            results[k]["time"] = times

            paper_utils.exp1_to_figure(results,
                                       save_directory=log_path,
                                       alpha=alpha,
                                       message=f"{metric_str}_{random_str}")

        exp1_loop(env, scrb, policy, eps_greedy, T, semi_metric, 50)
        paper_utils.exp1_overlayed_figure(env,
                                          scrb,
                                          save_directory=log_path,
                                          message=f"{metric_str}_{random_str}")
Example #8
def learn(
        *,
        network,
        env,
        mca_env,
        total_timesteps,
        seed=None,
        eval_env=None,
        replay_strategy='future',
        policy_save_interval=25,
        clip_return=True,
        demo_file=None,
        override_params=None,
        load_path=None,
        log_path=None,
        # save_path=None,
        **kwargs):
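    """Set up the main HER agent and the maximum-coverage (MCA) agent, build the
    per-cell MetricDiversifier state models, and delegate the training loop to
    train()."""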

    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        # rank/num_cpu are used unconditionally below, so default them when MPI is unavailable
        rank, num_cpu = 0, 1

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # assert operation mode
    assert kwargs["mode"] in ["basic", "exploration_module", "maximum_span"]

    if kwargs["mode"] == "basic":
        kwargs["mca_state_model"] = None

    def prepare_agent(_env,
                      eval_env,
                      active,
                      exploration='eps_greedy',
                      action_l2=None,
                      scope=None,
                      ss=False,
                      load_path=None):
        # Prepare params.
        _params = copy.deepcopy(config.DEFAULT_PARAMS)
        _kwargs = copy.deepcopy(kwargs)
        _override_params = copy.deepcopy(override_params)

        env_name = _env.spec.id
        _params['env_name'] = env_name
        _params['replay_strategy'] = replay_strategy
        _params['ss'] = ss
        if action_l2 is not None:
            _params['action_l2'] = action_l2
        if not active:
            _params["buffer_size"] = 1
        if env_name in config.DEFAULT_ENV_PARAMS:
            _params.update(config.DEFAULT_ENV_PARAMS[env_name]
                           )  # merge env-specific parameters in
        _params.update(
            **_override_params)  # makes it possible to override any parameter
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(_params, f)
        _params = config.prepare_params(_params)
        _params['rollout_batch_size'] = _env.num_envs

        if demo_file is not None:
            _params['bc_loss'] = 1
        _params.update(_kwargs)

        config.log_params(_params, logger=logger)

        if num_cpu == 1:
            logger.warn()
            logger.warn('*** Warning ***')
            logger.warn(
                'You are running HER with just a single MPI worker. This will work, but the '
                +
                'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
                +
                'were obtained with --num_cpu 19. This makes a significant difference and if you '
                +
                'are looking to reproduce those results, be aware of this. Please also refer to '
                +
                'https://github.com/openai/baselines/issues/314 for further details.'
            )
            logger.warn('****************')
            logger.warn()

        dims, coord_dict = config.configure_dims(_params)
        _params['ddpg_params']['scope'] = scope
        policy, reward_fun = config.configure_ddpg(dims=dims,
                                                   params=_params,
                                                   active=active,
                                                   clip_return=clip_return)
        if load_path is not None:
            tf_util.load_variables(load_path)
            print(f"Loaded model: {load_path}")

        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'exploration': exploration
        }

        eval_params = {
            'exploit': True,
            'use_target_net': _params['test_with_polyak'],
            'use_demo_states': False,
            'compute_Q': True,
        }

        for name in [
                'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
        ]:
            rollout_params[name] = _params[name]
            eval_params[name] = _params[name]

        eval_env = eval_env or _env

        rollout_worker = RolloutWorker(_env,
                                       policy,
                                       dims,
                                       logger,
                                       active,
                                       monitor=True,
                                       **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger, active,
                                  **eval_params)

        return policy, rollout_worker, evaluator, _params, coord_dict, reward_fun

    active = kwargs["mode"] in ["basic", "exploration_module"]
    policy, rollout_worker, evaluator, params, *_ = prepare_agent(
        env, eval_env, active=active, scope="main")

    n_cycles = params['n_cycles']
    ##############################################################################
    # Maximum Coverage Agent
    mca_active = kwargs["mode"] in ["exploration_module", "maximum_span"]
    mca_load_path = set_default_value(kwargs, 'mca_load_path', None)
    mca_exploration = set_default_value(kwargs, 'mca_exploration',
                                        'eps_greedy')
    mca_action_l2 = set_default_value(kwargs, 'mca_action_l2', 1)
    ss = set_default_value(kwargs, 'ss', False)
    trainable = set_default_value(kwargs, 'trainable', True)
    random_cover = set_default_value(kwargs, 'random_cover', False)
    semi_metric = set_default_value(kwargs, 'semi_metric', False)
    k = set_default_value(kwargs, 'k', 1000)
    feature_w = set_default_value(params, 'feature_w', None)
    invalidate_episodes = set_default_value(kwargs, 'invalidate_episodes',
                                            False)
    alpha = set_default_value(kwargs, 'alpha', 0.5)
    nscrb_updates = set_default_value(kwargs, 'nscrb_updates', 1000)

    mca_policy, mca_rw, mca_evaluator, mca_params, coord_dict, reward_fun = prepare_agent(
        mca_env,
        eval_env,
        active=mca_active,
        exploration=mca_exploration,
        action_l2=mca_action_l2,
        scope="mca",
        ss=ss,
        load_path=mca_load_path)

    if semi_metric:
        ncells = rollout_worker.T
    else:
        ncells = 1

    state_model_vec = []
    for cidx in range(ncells):
        state_model_vec.append(
            MetricDiversifier(
                k=k,
                reward_func=reward_fun,
                vis=False,
                feature_w=feature_w,
                vis_coords=coord_dict['vis'],
                load_model=kwargs['load_mca_path'],
                save_path=f"{log_path}/{cidx}/mca_cover",
                random_cover=random_cover,
                load_p=1,
            ))

    mca = MCA(policy=mca_policy,
              semi_metric=semi_metric,
              rollout_worker=mca_rw,
              evaluator=mca_evaluator,
              state_model=state_model_vec,
              coord_dict=coord_dict,
              active=(alpha > 0))
    ##############################################################################

    if 'n_epochs' not in kwargs:
        n_epochs = total_timesteps // n_cycles // rollout_worker.T // mca_rw.rollout_batch_size
    else:
        n_epochs = int(kwargs['n_epochs'])

    return train(save_path=log_path,
                 policy=policy,
                 rollout_worker=rollout_worker,
                 evaluator=evaluator,
                 n_epochs=n_epochs,
                 n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'],
                 n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval,
                 demo_file=demo_file,
                 mca=mca,
                 random_cover=random_cover,
                 trainable=trainable,
                 cover_measure_env=kwargs['cover_measure_env'],
                 invalidate_episodes=invalidate_episodes,
                 alpha=alpha,
                 nscrb_updates=nscrb_updates)