def learn_cartpole(config_path):
    env = gym.make('CartPole-v0')
    env._max_episode_steps = max_evaluation_steps

    # Load configuration.
    config = Config(StateMachineGenome, DefaultReproduction,
                    DefaultSpeciesSet, DefaultStagnation, config_path)

    # Create the population, which is the top-level object for a NEAT run.
    p = Population(config)

    # Add a stdout reporter to show progress in the terminal.
    p.add_reporter(StdOutReporter(True))
    stats = StatisticsReporter()
    p.add_reporter(stats)

    # Run for up to the given number of generations.
    f = lambda genomes, config: eval_genomes(genomes, config, env=env)
    winner = p.run(f, num_generations)

    # Build the winning state-machine network and show it running in the environment.
    input("Press Enter to continue...")
    net = StateMachineNetwork.create(winner, config)
    eval_network(net, env, True)
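The eval_genomes and eval_network helpers are defined elsewhere in the project. A minimal sketch of what they might look like, assuming the state-machine network exposes an activate(observation) method returning per-action scores and that fitness is the plain episode return (both are assumptions, not the original code):

import numpy as np

def eval_network(net, env, render=False):
    # Run a single episode and return the accumulated reward.
    ob = env.reset()
    total_reward, done = 0.0, False
    while not done:
        if render:
            env.render()
        action = np.argmax(net.activate(ob))  # activate() is an assumed API
        ob, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward

def eval_genomes(genomes, config, env):
    # Assign each genome's fitness as its episode return.
    for _, genome in genomes:
        net = StateMachineNetwork.create(genome, config)
        genome.fitness = eval_network(net, env)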
class NEATRunner(BaseTrainer):
    def __init__(
        self,
        config: Config,
        evaluator: GymEvaluator,
        reporters: Optional[Sequence[BaseReporter]] = None,
        num_workers: Optional[int] = multiprocessing.cpu_count(),
    ):
        self._evaluator = evaluator
        self._population = Population(config)
        reporters = reporters or []
        for reporter in reporters:
            self._population.add_reporter(reporter)
        self._num_workers = num_workers

    def _train(self, num_frames: Optional[int], stop_time: Optional[int]) -> DefaultGenome:
        if self._num_workers is None:
            # Serial evaluation; treat a missing frame budget as unlimited,
            # mirroring the parallel branch below.
            func = lambda g, c: self._evaluate_population_fitness(
                g, c, num_frames or float('inf'))
        else:
            parallel = ParallelEvaluator(
                num_workers=self._num_workers,
                evaluator=self._evaluator,
                max_num_frames=num_frames or float('inf'),
            )
            func = parallel.evaluate

        if stop_time is not None:
            # It may not be 100% reliable, but it's the best we can achieve
            # without writing a custom parallel executor.
            func = _timeout_func(func, time(), stop_time)

        try:
            return self._population.run(
                fitness_function=func,
                n=float('inf'),
            )
        except TimeoutError:
            return self._population.best_genome

    def _evaluate_population_fitness(
        self,
        genomes: Sequence[Tuple[int, DefaultGenome]],
        config: Config,
        max_num_frames: int,
    ):
        if self._evaluator.num_frames >= max_num_frames:
            raise TimeoutError()
        for _, genome in genomes:
            genome.fitness, num_frames = self._evaluator.evaluate(genome, config)
            self._evaluator.num_frames += num_frames
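_timeout_func is not shown here; one plausible implementation (a sketch, assuming the budget is given in seconds) wraps the fitness function and raises TimeoutError once the wall-clock limit is reached:

from time import time

def _timeout_func(func, start_time, stop_time):
    # Wrap a NEAT fitness function so the run aborts with TimeoutError
    # once stop_time seconds have elapsed since start_time.
    def wrapped(genomes, config):
        if time() - start_time >= stop_time:
            raise TimeoutError()
        return func(genomes, config)
    return wrapped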
def test_evolution(self):
    p = Population(10, 3, 1)
    status = p.get_status()
    for s in status.keys():
        output = []
        for i in range(status.get(s, 0)):
            output.append(p.run(s, i, [1, 2, 3]))
            p.set_score(s, i, 1)
    p.evolve()
def test_flow(self):
    p = Population(10, 3, 1)
    for _ in range(100):
        status = p.get_status()
        for s in status.keys():
            output = []
            for i in range(status.get(s, 0)):
                out = p.run(s, i, [1, 2, 3])[0]
                output.append(out)
                p.set_score(s, i, random.randrange(1, 10))
            print(s, output)
        p.evolve()
    # Print a sample individual from the evolved population.
    pr = Printer(p.population[next(iter(p.population))][0])
    pr.print()
def train():
    env = gym.make('LunarLanderContinuous-v2')

    # Resume from a saved population if one exists, otherwise start fresh.
    try:
        p = Population.load(FULLNAME)
        print('Existing state loaded')
    except FileNotFoundError as e:
        print(str(e) + '. Creating new state')
        p = Population(10000, env.observation_space.shape[0],
                       env.action_space.shape[0])

    while True:
        try:
            max_reward = -99999
            status = p.get_status()
            for s in status.keys():
                for i in range(status.get(s, 0)):
                    # One full episode per individual; the episode return is its score.
                    ob = env.reset()
                    reward_sum = 0
                    while True:
                        action = action_final_activation(p.run(s, i, ob))
                        ob, reward, done, info = env.step(action)
                        reward_sum = reward_sum + reward
                        if done:
                            break
                    p.set_score(s, i, reward_sum)
                    max_reward = np.max([reward_sum, max_reward])
            print(p.generation, max_reward, p.population.keys())
            try:
                p.save(AUTOSAVE)
            except RuntimeError as e:
                print('error saving: {}'.format(str(e)))
            p.evolve()
        except KeyboardInterrupt:
            try:
                print('\nsaving before exit')
                p.save(FULLNAME)
                sys.exit('Bye!')
            except RuntimeError as e:
                print('error saving: {}'.format(str(e)))
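action_final_activation is not shown; since LunarLanderContinuous-v2 expects actions in [-1, 1], one plausible implementation (a sketch, not the original helper) squashes the raw network outputs with tanh:

import numpy as np

def action_final_activation(raw_output):
    # Map unbounded network outputs into the [-1, 1] action range
    # expected by LunarLanderContinuous-v2.
    return np.tanh(np.asarray(raw_output, dtype=np.float64))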
passing_score = 500
p = Population(1000, 2, 3)
target_reward = 0.5
max_reward = -999999
winner = None
max_position = -1.2
min_position = 0.6

while True:
    status = p.get_status()
    for s in status.keys():
        output = []
        for i in range(status.get(s, 0)):
            ob = env.reset()
            reward_sum = 200
            while True:
                action = p.run(s, i, ob)
                ob, reward, done, info = env.step(np.argmax(action))
                max_position = np.max([max_position, ob[0]])
                min_position = np.min([min_position, ob[0]])
                reward_sum = reward_sum + reward
                if done:
                    break
            # Shaping bonus: reward the range of positions covered during the episode.
            reward_sum = reward_sum + ((max_position + 1.2) - (min_position + 1.2))
            max_reward = np.max([reward_sum, max_reward])
            p.set_score(s, i, reward_sum)
            if max_position >= target_reward:
                winner = (s, i)
                break
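Once the training loop has been exited with winner set (the outer while True would also need its own break, which this snippet omits), the same p.run interface can replay the winning individual with rendering; a minimal sketch:

# Hypothetical replay of the winning individual.
if winner is not None:
    s, i = winner
    ob = env.reset()
    done = False
    while not done:
        env.render()
        action = p.run(s, i, ob)
        ob, reward, done, info = env.step(np.argmax(action))
    env.close()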