Example #1
def test_params(comm: MPI.Comm, n: int, policy: Policy, nt: NoiseTable, gen_obstat: ObStat,
                fit_fn: Callable[[Module], TrainingResult], rs: RandomState) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """
    Tests `n` different perturbations of `policy`'s params and returns the positive and negative results
    (from all processes).

    positive_result[i] is the fitness when the noise at nt[noise_inds[i]] is added to policy.flat_params,
    and negative_result[i] is the fitness when the same noise is subtracted.

    :returns: tuple(positive results, negative results, noise inds, total steps)
    """
    results_pos, results_neg, inds = [], [], []
    for _ in range(n):
        idx, noise = nt.sample(rs)
        inds.append(idx)
        # for each noise ind sampled, both add and subtract the noise
        results_pos.append(fit_fn(policy.pheno(noise)))
        results_neg.append(fit_fn(policy.pheno(-noise)))
        gen_obstat.inc(*results_pos[-1].ob_sum_sq_cnt)
        gen_obstat.inc(*results_neg[-1].ob_sum_sq_cnt)

    n_objectives = len(results_pos[0].result)
    results = _share_results(comm, [tr.result for tr in results_pos],
                             [tr.result for tr in results_neg], inds)
    gen_obstat.mpi_inc(comm)
    steps = comm.allreduce(sum([tr.steps for tr in results_pos + results_neg]),
                           op=MPI.SUM)

    return (results[:, 0:n_objectives], results[:, n_objectives:2 * n_objectives],
            results[:, -1], steps)
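The sliced return value above assumes that `_share_results` packs each row as the positive fitnesses, then the negative fitnesses, then the noise index. A minimal NumPy sketch with made-up numbers (independent of the MPI machinery) of how such a row layout is split back apart:

import numpy as np

# Hypothetical packed results: [pos objectives | neg objectives | noise index] per row,
# here with 2 objectives and 3 sampled noise vectors.
n_objectives = 2
results = np.array([[1.0, 0.5, -1.0, -0.5, 7],
                    [2.0, 1.5, -2.0, -1.5, 3],
                    [0.0, 0.1, 0.2, 0.3, 9]])

pos = results[:, 0:n_objectives]                 # fitnesses with +noise
neg = results[:, n_objectives:2 * n_objectives]  # fitnesses with -noise
noise_inds = results[:, -1]                      # indices into the noise table
print(pos.shape, neg.shape, noise_inds)          # (3, 2) (3, 2) [7. 3. 9.]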
Example #2
def test_create_shared(comm):
    size = 5
    seed = 1
    noise = np.array(NoiseTable.create_shared(comm, size, 0, seed=seed).noise)

    all_noise = comm.alltoall([noise] * comm.size)
    assert np.isclose(all_noise, all_noise[0]).all()
    assert np.isclose(all_noise[0],
                      np.random.RandomState(seed).randn(size).astype(np.float32)).all()
Example #3
def batch_noise(inds: np.ndarray, nt: NoiseTable, policy_len: int, batch_size: int):
    """Need to batch noise otherwise will have to `dot` a large array"""
    assert inds.ndim == 1

    batch = []
    for idx in inds:
        batch.append(nt.get(int(idx), policy_len))
        if len(batch) == batch_size:
            yield np.array(batch)
            del batch[:]

    if batch:
        yield np.array(batch)
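A self-contained usage sketch of the generator above, assuming the `batch_noise` definition from this example is in scope and using a small stand-in for `NoiseTable` whose `get(idx, size)` returns `noise[idx:idx + size]` (the slice behaviour the tests in the following examples assume of the real class):

import numpy as np

class _StubNoiseTable:
    """Minimal stand-in for NoiseTable, used only for illustration."""
    def __init__(self, noise: np.ndarray):
        self.noise = noise

    def get(self, idx: int, size: int) -> np.ndarray:
        return self.noise[idx:idx + size]

nt = _StubNoiseTable(np.arange(100))
inds = np.arange(40)
for chunk in batch_noise(inds, nt, policy_len=5, batch_size=19):
    print(chunk.shape)  # (19, 5), (19, 5), then the remainder (2, 5)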
Example #4
def test_batch_noise():
    table_size = 100
    params = 50
    nt = NoiseTable(params, np.arange(table_size))
    inds = np.arange(40)

    expected = [[i + j for j in range(params)] for i in range(len(inds))]

    full_batch = next(batch_noise(inds, nt, len(inds)))
    assert (full_batch == expected).all()

    batched = batch_noise(inds, nt, 19)
    assert (next(batched) == expected[:19]).all()
    assert (next(batched) == expected[19:38]).all()
    assert (next(batched) == expected[38:]).all()
Example #5
def test_scale_noise():
    evals = 100
    params = 500
    table_size = 2000

    fits = np.arange(evals)
    inds = np.arange(evals)
    nt = NoiseTable(params, np.arange(table_size))

    scaled_batched = scale_noise(fits, inds, nt, 3)
    scaled_full = scale_noise(fits, inds, nt, evals)

    expected_noise = [[i + j for j in range(params)] for i in range(len(inds))]
    expected = np.dot(fits, expected_noise)

    assert (scaled_batched == expected).all()
    assert (scaled_full == expected).all()
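Both asserts hold because a dot product over all noise rows can be computed chunk by chunk and summed. A small standalone NumPy illustration of that identity (made-up data, independent of `scale_noise`):

import numpy as np

rng = np.random.default_rng(0)
fits = rng.normal(size=10)        # one fitness per noise row
noise = rng.normal(size=(10, 4))  # one noise row per evaluation

full = fits @ noise                                                     # single large dot
batched = sum(fits[i:i + 3] @ noise[i:i + 3] for i in range(0, 10, 3))  # batched dots, summed
print(np.allclose(full, batched))  # True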
Example #6
    env: gym.Env = gym.make(cfg.env.name)

    # seeding; this must be done before creating the neural network so that params are deterministic across processes
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving/viewing the seeds used on each proc
    print(f'seeds:{all_seeds}')

    # initializing obstat, policy, optimizer, noise and ranker
    nn = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env,
                     cfg.policy.ac_std, cfg.policy.ob_clip)
    policy: Policy = Policy(nn, cfg.noise.std,
                            Adam(len(Policy.get_flat(nn)), cfg.policy.lr))
    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size,
                                              len(policy), None,
                                              cfg.general.seed)
    ranker = CenteredRanker()

    def r_fn(model: torch.nn.Module) -> TrainingResult:
        save_obs = (rs.random() if rs is not None else np.random.random()) < cfg.policy.save_obs_chance
        rews, behv, obs, steps = gym_runner.run_model(model, env, 10000, rs)
        return RewardResult(rews, behv,
                            obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                            steps)

    assert cfg.general.policies_per_gen % comm.size == 0 and (
        cfg.general.policies_per_gen / comm.size) % 2 == 0
    eps_per_proc = int((cfg.general.policies_per_gen / comm.size) / 2)
    for gen in range(cfg.general.gens):  # main loop
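The assert and `eps_per_proc` computation just before the main loop encode the mirrored-sampling bookkeeping: every process evaluates an equal share of the generation's policies, and each sampled noise vector costs two evaluations (+noise and -noise). A worked example with made-up numbers:

policies_per_gen, n_procs = 64, 8  # hypothetical values for cfg.general.policies_per_gen and comm.size
assert policies_per_gen % n_procs == 0 and (policies_per_gen / n_procs) % 2 == 0
eps_per_proc = int((policies_per_gen / n_procs) / 2)
print(eps_per_proc)  # 4 noise samples per process, i.e. 8 evaluations per process per generation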
Example #7
def main(cfg: Munch):
    full_name = f'{cfg.env.name}-{cfg.general.name}'
    comm: MPI.Comm = MPI.COMM_WORLD
    env: gym.Env = gym.make(cfg.env.name)

    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name,
                                     LoggerReporter(comm, full_name),
                                     StdoutReporter(comm), mlflow_reporter)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    if cfg.nsr.adaptive:
        reporter.print("NSRA")
    elif cfg.nsr.progressive:
        reporter.print("P-NSRA")

    archive: Optional[np.ndarray] = None

    def ns_fn(model: torch.nn.Module, use_ac_noise=True) -> NSRResult:
        """Reward function"""
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews, behv, obs, steps = gym_runner.run_model(model, env, cfg.env.max_steps,
                                                      rs if use_ac_noise else None)
        return NSRResult(rews, behv[-3:],
                         obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                         steps, archive, cfg.novelty.k)

    # init population
    population = []
    nns = []
    for _ in range(cfg.general.n_policies):
        nns.append(
            FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env,
                        cfg.policy.ac_std, cfg.policy.ob_clip))
        population.append(
            Policy(nns[-1], cfg.noise.std,
                   Adam(len(Policy.get_flat(nns[-1])), cfg.policy.lr)))
    # init optimizer and noise table
    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size,
                                              len(population[0]), reporter,
                                              cfg.general.seed)
    policies_best_rewards = [-np.inf] * cfg.general.n_policies
    time_since_best = [0 for _ in range(cfg.general.n_policies)]  # TODO should this be per individual?
    obj_weight = [cfg.nsr.initial_w for _ in range(cfg.general.n_policies)]

    best_rew = -np.inf
    best_dist = -np.inf

    archive, policies_novelties = init_archive(comm, cfg, population, ns_fn)

    for gen in range(cfg.general.gens):  # main loop
        # picking the policy from the population
        idx = random.choices(list(range(len(policies_novelties))),
                             weights=policies_novelties,
                             k=1)[0]
        if cfg.nsr.progressive: idx = gen % cfg.general.n_policies
        idx = comm.scatter([idx] * comm.size)
        ranker = MultiObjectiveRanker(CenteredRanker(), obj_weight[idx])
        # reporting
        if cfg.general.mlflow: mlflow_reporter.set_active_run(idx)
        reporter.start_gen()
        reporter.log({'idx': idx})
        reporter.log({'w': obj_weight[idx]})
        reporter.log({'time since best': time_since_best[idx]})
        # running es
        tr, gen_obstat = es.step(cfg, comm, population[idx], nt, env, ns_fn,
                                 rs, ranker, reporter)
        for policy in population:
            policy.update_obstat(gen_obstat)  # shared obstat

        tr = comm.scatter([tr] * comm.size)  # sharing result
        # updating the weighting for choosing the next policy to be evaluated
        behv = comm.scatter([mean_behv(population[idx], ns_fn, cfg.novelty.rollouts)] * comm.size)
        nov = comm.scatter([novelty(behv, archive, cfg.novelty.k)] * comm.size)
        archive = update_archive(comm, behv, archive)  # adding new behaviour and sharing the archive
        policies_novelties[idx] = nov

        dist = np.linalg.norm(np.array(tr.positions[-3:-1]))
        rew = tr.reward

        if cfg.nsr.adaptive:
            obj_weight[idx], policies_best_rewards[idx], time_since_best[idx] = nsra(
                cfg, rew, obj_weight[idx], policies_best_rewards[idx], time_since_best[idx])
        elif cfg.nsr.progressive:
            obj_weight[idx] = (1 if gen > cfg.nsr.end_progression_gen
                               else gen / cfg.nsr.end_progression_gen)

        # Saving policy if it obtained a better reward or distance
        if (rew > best_rew or dist > best_dist) and comm.rank == 0:
            best_rew = max(rew, best_rew)
            best_dist = max(dist, best_dist)

            # Only need to save the archive, the policy is saved by DefaultMpiReporterSet
            archive_path = path.join('saved', full_name, 'archives')
            if not path.exists(archive_path):
                os.makedirs(archive_path)
            np.save(path.join(archive_path, f'{gen}.np'), archive)

        reporter.end_gen()

    mlflow.end_run()  # ending the outer mlflow run
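Under `cfg.nsr.progressive`, the objective weight follows a linear ramp from 0 to 1 over `end_progression_gen` generations and stays at 1 afterwards. A tiny standalone sketch of that schedule (with a hypothetical value for `cfg.nsr.end_progression_gen`):

end_progression_gen = 10  # hypothetical value
for gen in (0, 5, 10, 15):
    w = 1 if gen > end_progression_gen else gen / end_progression_gen
    print(gen, w)  # 0 0.0, 5 0.5, 10 1.0, 15 1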
Example #8
def main(cfg):
    comm: MPI.Comm = MPI.COMM_WORLD

    full_name = f'{cfg.env.name}-{cfg.general.name}'
    mlflow_reporter = MLFlowReporter(comm, cfg) if cfg.general.mlflow else None
    reporter = DefaultMpiReporterSet(comm, full_name,
                                     LoggerReporter(comm, full_name),
                                     StdoutReporter(comm), mlflow_reporter)

    env: gym.Env = gym.make(cfg.env.name)

    # seeding
    rs, my_seed, global_seed = utils.seed(comm, cfg.general.seed, env)
    all_seeds = comm.alltoall([my_seed] * comm.size)  # simply for saving the seeds used on each proc
    reporter.print(f'seeds:{all_seeds}')

    # initializing policy, optimizer, noise and env
    if 'load' in cfg.policy:
        policy: Policy = Policy.load(cfg.policy.load)
        nn: BaseNet = policy._module
    else:
        nn: BaseNet = FeedForward(cfg.policy.layer_sizes, torch.nn.Tanh(), env,
                                  cfg.policy.ac_std, cfg.policy.ob_clip)
        policy: Policy = Policy(nn, cfg.noise.std,
                                Adam(len(Policy.get_flat(nn)), cfg.policy.lr))

    nt: NoiseTable = NoiseTable.create_shared(comm, cfg.noise.tbl_size,
                                              len(policy), reporter,
                                              global_seed)

    ranker = CenteredRanker()
    if 0 < cfg.experimental.elite < 1:
        ranker = EliteRanker(CenteredRanker(), cfg.experimental.elite)

    best_max_rew = -np.inf  # highest achieved in any gen

    def r_fn(model: torch.nn.Module, use_ac_noise=True) -> TrainingResult:
        save_obs = rs.random() < cfg.policy.save_obs_chance
        rews = np.zeros(cfg.env.max_steps)
        for _ in range(max(1, cfg.general.eps_per_policy)):
            rew, behv, obs, steps = gym_runner.run_model(
                model, env, cfg.env.max_steps, rs if use_ac_noise else None)
            rews[:len(rew)] += np.array(rew)

        rews /= max(1, cfg.general.eps_per_policy)
        return RewardResult(rews.tolist(), behv,
                            obs if save_obs else np.array([np.zeros(env.observation_space.shape)]),
                            steps)

    time_since_best = 0
    noise_std_inc = 0.08
    for gen in range(cfg.general.gens):
        if cfg.general.mlflow: mlflow_reporter.set_active_run(0)
        reporter.start_gen()

        if cfg.noise.std_decay != 1:
            reporter.log({'noise std': policy.std})
        if cfg.policy.lr_decay != 1:
            reporter.log({'lr': policy.optim.lr})
        if cfg.policy.ac_std_decay != 1:
            reporter.log({'ac std': nn._action_std})

        tr, gen_obstat = es.step(cfg, comm, policy, nt, env, r_fn, rs, ranker,
                                 reporter)
        policy.update_obstat(gen_obstat)

        cfg.policy.ac_std = nn._action_std = nn._action_std * cfg.policy.ac_std_decay
        cfg.noise.std = policy.std = max(cfg.noise.std * cfg.noise.std_decay,
                                         cfg.noise.std_limit)
        cfg.policy.lr = policy.optim.lr = max(
            cfg.policy.lr * cfg.policy.lr_decay, cfg.policy.lr_limit)

        reporter.log({'obs recorded': policy.obstat.count})

        max_rew_ind = np.argmax(ranker.fits[:, 0])
        max_rew = ranker.fits[:, 0][max_rew_ind]

        time_since_best = 0 if max_rew > best_max_rew else time_since_best + 1
        reporter.log({'time since best': time_since_best})
        # increasing noise std if policy is stuck
        if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.explore_with_large_noise:
            cfg.noise.std = policy.std = policy.std + noise_std_inc

        if 0 < cfg.experimental.elite < 1:  # using elite extension
            if time_since_best > cfg.experimental.max_time_since_best and cfg.experimental.elite < 1:
                ranker.elite_percent = cfg.experimental.elite
            if time_since_best == 0:
                ranker.elite_percent = 1
            reporter.print(f'elite percent: {ranker.elite_percent}')

        # Saving max rew if it obtained best ever rew
        if max_rew > best_max_rew and comm.rank == 0:
            best_max_rew = max_rew
            coeff = 1 if max_rew_ind < ranker.n_fits_ranked // 2 else -1  # checking if pos or neg noise ind used
            # TODO save this as a policy
            torch.save(policy.pheno(coeff * ranker.noise_inds[max_rew_ind % (ranker.n_fits_ranked // 2)]),
                       path.join('saved', full_name, 'weights', f'gen{gen}-rew{best_max_rew:0.0f}.pt'))
            reporter.print(f'saving max policy with rew:{best_max_rew:0.2f}')

        reporter.end_gen()
    mlflow.end_run()  # in the case where mlflow is the reporter, just ending its run
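The `coeff`/index arithmetic in the save block above assumes the ranker stores the fitnesses of all positive perturbations in the first half of its ranked array and the matching negative perturbations in the second half, so both the sign and the noise slot can be recovered from the arg-max. A small sketch of that mapping with made-up numbers:

n_fits_ranked = 10  # e.g. 5 noise samples, each evaluated with +noise and -noise
for max_rew_ind in (2, 7):
    coeff = 1 if max_rew_ind < n_fits_ranked // 2 else -1  # first half -> +noise, second half -> -noise
    noise_slot = max_rew_ind % (n_fits_ranked // 2)        # position within the sampled noise indices
    print(max_rew_ind, coeff, noise_slot)  # 2 -> (1, 2); 7 -> (-1, 2)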