Example 1
    def objective_two(
        individual: cgp.IndividualSingleGenome,
        cum_reward_threshold: int,
        gamma: float,
        seed: int,
        rng: np.random.Generator,
    ):
        # skip reevaluation if this individual already has a fitness
        if not individual.fitness_is_None():
            return individual

        # environment initialization
        env = gym.make('CartPole-v0')

        try:
            individual.fitness = inner_objective(
                individual,
                network=individual.network,
                env=env,
                cum_reward_threshold=cum_reward_threshold,
                gamma=gamma,
                seed=seed,
                rng=rng,
                mode='episode_min')
        except ZeroDivisionError:
            # numerically invalid rules receive the worst possible fitness
            individual.fitness = -np.inf

        return individual
Example 2
def inner_objective(
    ind: cgp.IndividualSingleGenome,
    network_params: dict,
    curriculum_params: dict,
    seeds
) -> float:

    rule = ind.to_torch()

    reward_per_seed = []
    reward_per_seed_mean = []
    for seed in seeds:
        seed = int(seed)

        torch.manual_seed(seed=seed)
        rng = np.random.default_rng(seed=seed)

        # environment and network initialization
        env = DynamicMiniGrid(seed=seed)
        env = ImgObsWrapper(env)
        # keep only the first channel of the image observation, flattened
        state = env.respawn()["image"][:, :, 0].flatten()

        policy_net = Network(n_inputs=np.size(state), **network_params)

        rewards_over_alterations = run_curriculum(
            env=env, net=policy_net, rule=rule, rng=rng, **curriculum_params
        )

        reward_per_seed.append(rewards_over_alterations)
        reward_per_seed_mean.append(np.mean(rewards_over_alterations))

    ind.reward_matrix = reward_per_seed
    reward_mean = np.mean(reward_per_seed_mean)

    return float(reward_mean)
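Besides returning the scalar fitness, the function stores the raw per-seed reward trajectories on the individual. A minimal sketch of inspecting that matrix after evaluation, assuming every seed runs through the same number of environment alterations:

import numpy as np

# after: fitness = inner_objective(ind, network_params, curriculum_params, seeds)
reward_matrix = np.asarray(ind.reward_matrix)  # shape: (n_seeds, n_alterations)
per_seed_means = reward_matrix.mean(axis=1)    # one mean reward per seed
assert np.isclose(per_seed_means.mean(), fitness)
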
Example 3
def _create_individual(genome, fitness=None, individual_type="SingleGenome"):
    if individual_type == "SingleGenome":
        ind = IndividualSingleGenome(genome)
    elif individual_type == "MultiGenome":
        # the multi-genome individual expects a list of genomes
        ind = IndividualMultiGenome([genome])
    else:
        raise NotImplementedError(f"Unknown individual type: {individual_type}")
    if fitness is not None:
        ind.fitness = fitness
    return ind
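For illustration, both branches of the factory can be exercised like this (a sketch; genome stands in for any cgp.Genome instance built elsewhere):

# genome: any cgp.Genome instance
ind_single = _create_individual(genome, fitness=1.0)
ind_multi = _create_individual(genome, fitness=1.0, individual_type="MultiGenome")
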
Example 4
def objective(
        individual: cgp.IndividualSingleGenome,
        network_params: dict,
        curriculum_params: dict,
        seeds
):

    # skip reevaluation if this individual already has a fitness
    if not individual.fitness_is_None():
        return individual

    try:
        individual.fitness = inner_objective(individual, network_params, curriculum_params, seeds)
    except ZeroDivisionError:
        # numerically invalid rules receive the worst possible fitness
        individual.fitness = -np.inf
    return individual
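To plug this objective into an evolution loop, the extra configuration can be bound ahead of time so the loop only ever passes the individual. A sketch with placeholder values; the parameter keys shown are assumptions, not the project's actual configuration:

import functools

import numpy as np

obj = functools.partial(
    objective,
    network_params={"n_hidden": 100, "learning_rate": 2e-4},  # assumed keys
    curriculum_params={},                                      # placeholder
    seeds=np.arange(5),
)

# obj(individual) now evaluates the individual in place and returns it,
# matching the single-argument callable an evolution loop expects.
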
Example 5
    def objective_one(
        individual: cgp.IndividualSingleGenome,
        n_episodes: int,
        gamma: float,
        seed: int,
        rng: np.random.Generator,
    ):

        # skip reevaluation if this individual already has a fitness
        if not individual.fitness_is_None():
            return individual

        # environment initialization
        env = gym.make('CartPole-v0')
        env.seed(seed=seed)
        env.action_space.seed(seed)

        # network initialization
        torch.manual_seed(seed=seed)
        network = Network(n_inputs=env.observation_space.shape[0],
                          n_hidden=100,
                          n_outputs=env.action_space.n,
                          learning_rate=2e-4,
                          weight_update_mode='evolved_rule')

        try:
            individual.fitness = inner_objective(individual,
                                                 network=network,
                                                 env=env,
                                                 n_episodes=n_episodes,
                                                 gamma=gamma,
                                                 seed=seed,
                                                 rng=rng,
                                                 mode='reward_max')
        except ZeroDivisionError:
            individual.fitness = -np.inf

        # TODO: write network.state_dict() to the individual (and possibly pickle.dumps it)
        individual.network = network  # assign the trained network to the individual for objective 2
        return individual
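objective_one stores the trained network on the individual, and objective_two (Example 1) then reuses it via individual.network. A sketch of binding both stages so they can be evaluated in sequence, for instance by a hurdle-aware loop like the one tested in Example 7; all concrete values here are placeholders:

import functools

import numpy as np

rng = np.random.default_rng(1234)

# stage one trains the network and attaches it to the individual;
# stage two continues from individual.network
obj_one = functools.partial(
    objective_one, n_episodes=100, gamma=0.9, seed=1234, rng=rng
)
obj_two = functools.partial(
    objective_two, cum_reward_threshold=1000, gamma=0.9, seed=1234, rng=rng
)
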
Example 6
    def inner_objective(
        ind: cgp.IndividualSingleGenome,
        network: Network,
        env: gym.Env,
        seed: int,
        rng: np.random.Generator,
        mode: str,
        gamma: float = 0.9,
        n_steps_per_run: int = 200,
        n_episodes: int = 0,
        cum_reward_threshold: int = 0,
        n_episodes_reward_expectation: float = 100.0,
    ) -> float:

        # compile the evolved graph into a torch module representing the update rule
        rule = ind.to_torch()
        cum_reward = 0
        episode_counter = 0
        expected_cum_reward_per_episode = 0
        while episode_counter < n_episodes or cum_reward <= cum_reward_threshold:
            state = env.reset()
            el_traces = torch.zeros([
                network.output_layer.out_features,
                network.output_layer.in_features + 1
            ])
            discounted_reward = 0

            for _ in range(n_steps_per_run):
                action, probs, hidden_activities = network.get_action(
                    state, rng)

                hidden_activities = torch.cat(
                    (hidden_activities, torch.ones(1)), 0)
                log_prob = torch.log(probs.squeeze(0)[action])

                new_state, reward, done, _ = env.step(action)
                discounted_reward *= gamma
                discounted_reward += reward
                cum_reward += reward

                el_traces = update_el_traces(el_traces, probs,
                                             hidden_activities, action)

                update_weights_online_with_rule(
                    rule=rule,
                    network=network,
                    reward=reward,
                    el_traces=el_traces,
                    log_prob=log_prob,
                    discounted_reward=discounted_reward,
                    done=done,
                    expected_cum_reward_per_episode=expected_cum_reward_per_episode,
                )

                if done:
                    episode_counter += 1
                    break
                state = new_state

            # update the exponential moving average (smoothing factor
            # 1 / n_episodes_reward_expectation) of the cumulative reward
            # normalized by the maximum steps per run; it serves as the
            # reward expectation passed to the update rule
            expected_cum_reward_per_episode = (
                (1 - 1 / n_episodes_reward_expectation) * expected_cum_reward_per_episode
                + (1 / n_episodes_reward_expectation) * cum_reward / n_steps_per_run
            )
        env.close()
        if mode == 'reward_max':
            return float(cum_reward)
        elif mode == 'episode_min':
            return float(-episode_counter)
        else:
            raise ValueError(f'Unknown mode: {mode}')
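The baseline update at the end of each episode is a standard exponential moving average with smoothing factor 1 / n_episodes_reward_expectation. A self-contained sketch of how such an average tracks a noisy per-step reward:

import numpy as np

rng = np.random.default_rng(0)
alpha = 1 / 100  # matches n_episodes_reward_expectation = 100
ema = 0.0
for _ in range(1000):
    per_step_reward = 0.5 + 0.1 * rng.standard_normal()
    ema = (1 - alpha) * ema + alpha * per_step_reward
# after many updates, ema approaches the mean per-step reward (~0.5)
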
Example 7
def test_sort_with_hurdles():
    # one objective: fittest individual should be first
    ind0 = IndividualSingleGenome([])
    ind0._fitness = [0]
    ind1 = IndividualSingleGenome([])
    ind1._fitness = [1]
    ind2 = IndividualSingleGenome([])
    ind2._fitness = [2]
    ind3 = IndividualSingleGenome([])
    ind3._fitness = [3]
    individuals = [ind0, ind1, ind2, ind3]
    ea = cgp.ea.MuPlusLambda()
    sorted_individuals = ea._sort(individuals)
    expected = [[3], [2], [1], [0]]
    assert [ind._fitness for ind in sorted_individuals] == expected

    # two objectives with hurdles: individuals that pass more hurdles should
    # come first, and within each hurdle level they are sorted by fitness
    ind0 = IndividualSingleGenome([])
    ind0._fitness = [0, 5]
    ind1 = IndividualSingleGenome([])
    ind1._fitness = [1, None]
    ind2 = IndividualSingleGenome([])
    ind2._fitness = [2, 4]
    ind3 = IndividualSingleGenome([])
    ind3._fitness = [3, None]
    individuals = [ind0, ind1, ind2, ind3]
    ea = cgp.ea.MuPlusLambda()
    sorted_individuals = ea._sort(individuals)
    expected = [[0, 5], [2, 4], [3, None], [1, None]]
    assert [ind._fitness for ind in sorted_individuals] == expected
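One sort key consistent with the expected ordering (a sketch, not necessarily hal-cgp's internal implementation): count the hurdles an individual has passed, then break ties by the fitness at the last passed hurdle.

def hurdle_sort_key(ind):
    # fitness entries are None once an individual fails a hurdle
    passed = [f for f in ind._fitness if f is not None]
    return (len(passed), passed[-1] if passed else float("-inf"))

sorted_individuals = sorted(individuals, key=hurdle_sort_key, reverse=True)
# reproduces the expected order [[0, 5], [2, 4], [3, None], [1, None]]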