def objective_two(
    individual: cgp.IndividualSingleGenome,
    cum_reward_threshold: int,
    gamma: float,
    seed: int,
    rng: np.random.Generator,
):
    if not individual.fitness_is_None():
        return individual

    # environment initialization
    env = gym.make('CartPole-v0')

    try:
        individual.fitness = inner_objective(
            individual,
            network=individual.network,
            env=env,
            cum_reward_threshold=cum_reward_threshold,
            gamma=gamma,
            seed=seed,
            rng=rng,
            mode='episode_min',
        )
    except ZeroDivisionError:
        individual.fitness = -np.inf

    return individual
def inner_objective(
    ind: cgp.IndividualSingleGenome,
    network_params: dict,
    curriculum_params: dict,
    seeds,
) -> float:
    rule = ind.to_torch()

    reward_per_seed = []
    reward_per_seed_mean = []
    for seed in seeds:
        seed = int(seed)
        torch.manual_seed(seed=seed)
        rng = np.random.default_rng(seed=seed)

        # environment and network initialization
        env = DynamicMiniGrid(seed=seed)
        env = ImgObsWrapper(env)
        state = env.respawn()["image"][:, :, 0].flatten()

        policy_net = Network(n_inputs=np.size(state), **network_params)

        rewards_over_alterations = run_curriculum(
            env=env, net=policy_net, rule=rule, **curriculum_params, rng=rng
        )

        reward_per_seed.append(rewards_over_alterations)
        reward_per_seed_mean.append(np.mean(rewards_over_alterations))

    ind.reward_matrix = reward_per_seed
    reward_mean = np.mean(reward_per_seed_mean)
    return float(reward_mean)
def _create_individual(genome, fitness=None, individual_type="SingleGenome"):
    if individual_type == "SingleGenome":
        ind = IndividualSingleGenome(genome)
    elif individual_type == "MultiGenome":
        ind = IndividualMultiGenome([genome])
    else:
        raise NotImplementedError("Unknown individual type.")

    if fitness is not None:
        ind.fitness = fitness
    return ind
def objective(
    individual: cgp.IndividualSingleGenome,
    network_params: dict,
    curriculum_params: dict,
    seeds,
):
    if not individual.fitness_is_None():
        return individual

    try:
        individual.fitness = inner_objective(
            individual, network_params, curriculum_params, seeds
        )
    except ZeroDivisionError:
        individual.fitness = -np.inf

    return individual
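# For context, a minimal sketch (not taken from this repository) of how the
# objective above could be bound before being handed to hal-cgp's evolve loop,
# which expects a callable that takes only the individual. All parameter values
# and the curriculum_params keys below are hypothetical placeholders.
import functools

import numpy as np

network_params_example = {
    "n_hidden": 100,
    "n_outputs": 3,
    "learning_rate": 2e-4,
    "weight_update_mode": "evolved_rule",
}
curriculum_params_example = {"n_alterations": 10}  # hypothetical key
seeds_example = np.arange(4)

obj = functools.partial(
    objective,
    network_params=network_params_example,
    curriculum_params=curriculum_params_example,
    seeds=seeds_example,
)
# obj(individual) is what the evolutionary algorithm calls during evolution.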
def objective_one(
    individual: cgp.IndividualSingleGenome,
    n_episodes: int,
    gamma: float,
    seed: int,
    rng: np.random.Generator,
):
    if not individual.fitness_is_None():
        return individual

    # environment initialization
    env = gym.make('CartPole-v0')
    env.seed(seed=seed)
    env.action_space.seed(seed)

    # network initialization
    torch.manual_seed(seed=seed)
    network = Network(
        n_inputs=env.observation_space.shape[0],
        n_hidden=100,
        n_outputs=env.action_space.n,
        learning_rate=2e-4,
        weight_update_mode='evolved_rule',
    )

    try:
        individual.fitness = inner_objective(
            individual,
            network=network,
            env=env,
            n_episodes=n_episodes,
            gamma=gamma,
            seed=seed,
            rng=rng,
            mode='reward_max',
        )
    except ZeroDivisionError:
        individual.fitness = -np.inf

    # TODO: write network.state_dict() to the individual (and possibly pickle.dumps)
    individual.network = network  # assign the trained network to the individual for objective 2

    return individual
def inner_objective(
    ind: cgp.IndividualSingleGenome,
    network: Network,
    env: gym.Env,
    seed: int,
    rng: np.random.Generator,
    mode: str,
    gamma: float = 0.9,
    n_steps_per_run: int = 200,
    n_episodes: int = 0,
    cum_reward_threshold: int = 0,
    n_episodes_reward_expectation: float = 100,
) -> float:
    rule = ind.to_torch()

    cum_reward = 0
    episode_counter = 0
    expected_cum_reward_per_episode = 0

    # 'reward_max' runs a fixed number of episodes and maximizes the cumulative
    # reward; 'episode_min' runs until a cumulative-reward threshold is reached
    # and minimizes the number of episodes needed to get there.
    while episode_counter < n_episodes or cum_reward <= cum_reward_threshold:
        state = env.reset()
        el_traces = torch.zeros(
            [network.output_layer.out_features, network.output_layer.in_features + 1]
        )
        discounted_reward = 0

        for _ in range(n_steps_per_run):
            action, probs, hidden_activities = network.get_action(state, rng)
            hidden_activities = torch.cat((hidden_activities, torch.ones(1)), 0)
            log_prob = torch.log(probs.squeeze(0)[action])

            new_state, reward, done, _ = env.step(action)
            discounted_reward *= gamma
            discounted_reward += reward
            cum_reward += reward

            el_traces = update_el_traces(el_traces, probs, hidden_activities, action)

            update_weights_online_with_rule(
                rule=rule,
                network=network,
                reward=reward,
                el_traces=el_traces,
                log_prob=log_prob,
                discounted_reward=discounted_reward,
                done=done,
                expected_cum_reward_per_episode=expected_cum_reward_per_episode,
            )

            if done:
                episode_counter += 1
                break
            state = new_state

        # Exponential moving average (time constant n_episodes_reward_expectation,
        # updated once per episode) of the cumulative reward normalized by
        # n_steps_per_run; passed to the evolved weight-update rule.
        expected_cum_reward_per_episode = (
            (1 - 1 / n_episodes_reward_expectation) * expected_cum_reward_per_episode
            + (1 / n_episodes_reward_expectation) * cum_reward / n_steps_per_run
        )

    env.close()

    if mode == 'reward_max':
        return float(cum_reward)
    elif mode == 'episode_min':
        return float(-episode_counter)
    else:
        raise ValueError(f"Unknown mode: {mode}")
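# A possible way to bind the two CartPole objectives for the two-stage
# (hurdle-style) evaluation this module suggests: objective_one trains the
# network with the evolved rule and maximizes reward, objective_two reuses
# individual.network and minimizes the episodes needed to reach a
# cumulative-reward threshold. The concrete values below are assumptions, and
# which individuals reach the second stage is decided by the hurdle-aware EA,
# not shown here.
import functools

import numpy as np

seed_example = 1234
rng_example = np.random.default_rng(seed_example)

obj_one = functools.partial(
    objective_one, n_episodes=300, gamma=0.9, seed=seed_example, rng=rng_example
)
obj_two = functools.partial(
    objective_two, cum_reward_threshold=2000, gamma=0.9, seed=seed_example, rng=rng_example
)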
def test_sort_with_hurdles():
    # one objective: the fittest individual should come first
    ind0 = IndividualSingleGenome([])
    ind0._fitness = [0]
    ind1 = IndividualSingleGenome([])
    ind1._fitness = [1]
    ind2 = IndividualSingleGenome([])
    ind2._fitness = [2]
    ind3 = IndividualSingleGenome([])
    ind3._fitness = [3]
    individuals = [ind0, ind1, ind2, ind3]

    ea = cgp.ea.MuPlusLambda()
    sorted_individuals = ea._sort(individuals)
    expected = [[3], [2], [1], [0]]
    assert [ind._fitness for ind in sorted_individuals] == expected

    # two objectives with hurdle: individuals that pass more hurdles come first;
    # within each hurdle, individuals are sorted by their fitness
    ind0 = IndividualSingleGenome([])
    ind0._fitness = [0, 5]
    ind1 = IndividualSingleGenome([])
    ind1._fitness = [1, None]
    ind2 = IndividualSingleGenome([])
    ind2._fitness = [2, 4]
    ind3 = IndividualSingleGenome([])
    ind3._fitness = [3, None]
    individuals = [ind0, ind1, ind2, ind3]

    ea = cgp.ea.MuPlusLambda()
    sorted_individuals = ea._sort(individuals)
    expected = [[0, 5], [2, 4], [3, None], [1, None]]
    assert [ind._fitness for ind in sorted_individuals] == expected