def _run_trial_explore(self, env, time, current_trial=None) \
        -> TrialMetrics:
    """Run a single exploration trial against *env*.

    The agent repeatedly matches classifiers against the current
    perception, applies the Anticipatory Learning Process (ALP),
    reinforcement learning and (optionally) the genetic algorithm,
    then acts epsilon-greedily until the episode terminates.

    Parameters
    ----------
    env
        Environment exposing ``reset()``, ``step(action)`` returning a
        4-tuple ``(obs, reward, done, info)`` and
        ``action_space.sample()`` (Gym-style API, as used below).
    time
        Global time counter; per-step timestamps are ``time + steps``.
    current_trial
        Unused here; kept for interface symmetry with other trial
        runners — TODO confirm against sibling methods.

    Returns
    -------
    TrialMetrics
        Number of steps taken and the last reward received.
    """
    logger.debug("** Running trial explore ** ")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    # Translate the raw environment observation into the LCS genotype.
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    # Seed with a random action; overwritten before first env.step().
    action = env.action_space.sample()
    last_reward = 0
    prev_state = Perception.empty()
    action_set = ClassifiersList()
    done = False
    # Best fitness of the previous match set, used as the RL baseline.
    prev_M_best_fitness = 0
    was_greedy = False

    while not done:
        state = Perception(state)
        match_set = self.population.form_match_set(state)

        if steps > 0:
            # Apply learning in the last action set
            # (skipped on the first step — there is no previous
            # transition to learn from yet).
            ClassifiersList.apply_alp(
                self.population,
                match_set,
                action_set,
                prev_state,
                action,
                state,
                time + steps,
                self.cfg.theta_exp,
                self.cfg)
            self.apply_reinforcement_learning(
                action_set,
                last_reward,
                prev_M_best_fitness,
                match_set.get_maximum_fitness(),
                was_greedy)
            if self.cfg.do_ga:
                ClassifiersList.apply_ga(
                    time + steps,
                    self.population,
                    match_set,
                    action_set,
                    state,
                    self.cfg.theta_ga,
                    self.cfg.mu,
                    self.cfg.chi,
                    self.cfg.theta_as,
                    self.cfg.do_subsumption,
                    self.cfg.theta_exp)

        # Choose the next action (epsilon-greedy) and map it to the
        # environment's native action representation.
        action, was_greedy = self._epsilon_greedy(match_set)
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        logger.debug("\tExecuting action: [%d]", action)
        action_set = match_set.form_action_set(action)

        # Snapshot the pre-step state and fitness baseline BEFORE
        # stepping the environment — both are needed for the next
        # iteration's (or terminal) learning update.
        prev_state = Perception(state)
        prev_M_best_fitness = match_set.get_maximum_fitness()

        raw_state, last_reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)
        state = Perception(state)

        if done:
            # Terminal update: no successor match set exists, so an
            # empty ClassifiersList and a 0 future-fitness estimate
            # are used in place of the next step's match set.
            ClassifiersList.apply_alp(
                self.population,
                ClassifiersList(),
                action_set,
                prev_state,
                action,
                state,
                time + steps,
                self.cfg.theta_exp,
                self.cfg)
            self.apply_reinforcement_learning(
                action_set,
                last_reward,
                prev_M_best_fitness,
                0,
                was_greedy)
            if self.cfg.do_ga:
                ClassifiersList.apply_ga(
                    time + steps,
                    self.population,
                    ClassifiersList(),
                    action_set,
                    state,
                    self.cfg.theta_ga,
                    self.cfg.mu,
                    self.cfg.chi,
                    self.cfg.theta_as,
                    self.cfg.do_subsumption,
                    self.cfg.theta_exp)

        steps += 1

    return TrialMetrics(steps, last_reward)
if __name__ == '__main__':
    # Play some games
    for g in range(GAMES):
        # Fresh (empty) action set for every game.
        action_set = ClassifiersList(cfg=cfg)
        prev_state, action, reward, done = None, None, None, False
        state = board.reset()
        moves = 0

        while not done:
            player = determine_player(moves)  # Determine player
            match_set = ClassifiersList.form_match_set(population, state, cfg)

            if moves > 0:
                # Learn from the previous transition (skipped on the
                # very first move — there is nothing to learn from yet).
                action_set.apply_alp(prev_state, action, state,
                                     ALL_MOVES + moves, population,
                                     match_set)
                action_set.apply_reinforcement_learning(
                    reward, match_set.get_maximum_fitness())
                if cfg.do_ga:
                    action_set.apply_ga(ALL_MOVES + moves, population,
                                        match_set, state)

            # Determine best action
            action = match_set.choose_action(cfg.epsilon)
            action_set = ClassifiersList.form_action_set(
                match_set, action, cfg)
            prev_state = state
            state, reward, done, debug = board.step(action)

            # BUG FIX: the move counter was never incremented, so the
            # learning branch above (`moves > 0`) could never run and
            # determine_player()/the ALP timestamp were stuck at 0.
            moves += 1