def _run_trial_exploit(self, env, trials, current_trial) -> TrialMetrics:
    logger.debug("** Running trial exploit **")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    last_reward = 0
    done = False

    while not done:
        state = Perception(state)
        match_set = self.population.form_match_set(state)

        # Exploit: always act on the best matching classifier
        selected_cl = self._best_cl(match_set)
        action = selected_cl.action
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        logger.debug("\tExecuting action: [%d]", action)

        raw_state, last_reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)

        steps += 1

    return TrialMetrics(steps, last_reward)
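# NOTE: `_run_trial_exploit` relies on a `_best_cl` helper that is not
# shown in this section. A minimal sketch of what it could look like,
# assuming each classifier exposes a numeric `fitness` attribute; the
# real selection criterion (e.g. quality * reward) may differ:
def _best_cl(self, match_set):
    # Greedy choice: the matching classifier with the highest fitness
    return max(match_set, key=lambda cl: cl.fitness)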
def _run_trial_explore(self, env, trials, current_trial) -> TrialMetrics:
    logger.debug("** Running trial explore **")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    last_reward = 0
    prev_state = Perception.empty()
    selected_cl = None
    prev_selected_cl = None
    done = False

    while not done:
        state = Perception(state)
        match_set = self.population.form_match_set(state)

        if steps > 0:
            # Learn from the previously executed classifier
            alp.apply(prev_state, state, selected_cl, self.population)
            rl.bucket_brigade_update(
                selected_cl, prev_selected_cl, last_reward)

        prev_selected_cl = selected_cl

        # Epsilon-greedy action selection
        # TODO: you can do it better
        if random.random() < self.cfg.epsilon:
            selected_cl = random.choice(match_set)
        else:
            selected_cl = self._best_cl(match_set)

        action = selected_cl.action
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        logger.debug("\tExecuting action: [%d]", action)

        prev_state = Perception(state)
        raw_state, last_reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)
        state = Perception(state)

        if done:
            # Final learning step for the terminal transition
            alp.apply(prev_state, state, selected_cl, self.population)
            rl.bucket_brigade_update(
                selected_cl, prev_selected_cl, last_reward)

        steps += 1

    return TrialMetrics(steps, last_reward)
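# NOTE: a sketch of the credit assignment assumed by
# `rl.bucket_brigade_update` above. It follows the classic bucket
# brigade scheme - the current classifier pays a share of its strength
# back to the classifier that preceded it, and collects the external
# reward itself. The attribute name `r` and the `bid_ratio` parameter
# are assumptions, not the module's actual API:
def bucket_brigade_update(cl, prev_cl, reward, bid_ratio=0.1):
    bid = bid_ratio * cl.r
    if prev_cl is not None:
        cl.r -= bid        # the active classifier pays its bid...
        prev_cl.r += bid   # ...to its predecessor in the chain
    cl.r += reward         # external reward goes to the active one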
def _run_trial_explore(self, env, time, current_trial=None) \
        -> TrialMetrics:
    logger.debug("** Running trial explore **")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    action = env.action_space.sample()
    last_reward = 0
    prev_state = Perception.empty()
    action_set = ClassifiersList()
    done = False

    while not done:
        if self.cfg.do_action_planning and \
                self._time_for_action_planning(steps + time):
            # Action planning for increased model learning
            steps_ap, state, prev_state, action_set, \
                action, last_reward = \
                self._run_action_planning(env, steps + time, state,
                                          prev_state, action_set,
                                          action, last_reward)
            steps += steps_ap

        state = Perception(state)
        match_set = self.population.form_match_set(state)

        if steps > 0:
            # Apply learning in the last action set
            ClassifiersList.apply_alp(
                self.population, match_set, action_set,
                prev_state, action, state,
                time + steps, self.cfg.theta_exp, self.cfg)
            ClassifiersList.apply_reinforcement_learning(
                action_set, last_reward,
                match_set.get_maximum_fitness(),
                self.cfg.beta, self.cfg.gamma)
            if self.cfg.do_ga:
                ClassifiersList.apply_ga(
                    time + steps, self.population, match_set,
                    action_set, state, self.cfg.theta_ga, self.cfg.mu,
                    self.cfg.chi, self.cfg.theta_as,
                    self.cfg.do_subsumption, self.cfg.theta_exp)

        action = choose_action(
            match_set,
            self.cfg.number_of_possible_actions,
            self.cfg.epsilon,
            self.cfg.biased_exploration)
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        logger.debug("\tExecuting action: [%d]", action)
        action_set = match_set.form_action_set(action)

        prev_state = Perception(state)
        raw_state, last_reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)
        state = Perception(state)

        if done:
            ClassifiersList.apply_alp(
                self.population, ClassifiersList(), action_set,
                prev_state, action, state,
                time + steps, self.cfg.theta_exp, self.cfg)
            ClassifiersList.apply_reinforcement_learning(
                action_set, last_reward, 0,
                self.cfg.beta, self.cfg.gamma)
            if self.cfg.do_ga:
                ClassifiersList.apply_ga(
                    time + steps, self.population, ClassifiersList(),
                    action_set, state, self.cfg.theta_ga, self.cfg.mu,
                    self.cfg.chi, self.cfg.theta_as,
                    self.cfg.do_subsumption, self.cfg.theta_exp)

        steps += 1

    return TrialMetrics(steps, last_reward)
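# NOTE: `choose_action` is imported from the action-selection strategy
# module. A simplified sketch matching the signature used above; the
# biased-exploration branch (favouring rarely tested actions) is
# reduced here to a uniform random draw, which is an assumption:
import random

def choose_action(match_set, n_actions, epsilon, biased_exploration):
    if random.random() < epsilon:
        # Explore - with probability `biased_exploration` the real
        # implementation may prefer knowledge-gaining actions
        return random.randrange(n_actions)
    # Exploit - follow the best classifier in the match set
    best = max(match_set, key=lambda cl: cl.fitness)
    return best.action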
def test_should_create_empty_perception(self):
    # when
    p = Perception.empty()

    # then
    assert p is not None
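# NOTE: the test above only checks that `Perception.empty()` yields an
# object. One plausible implementation, assuming Perception wraps an
# immutable sequence of attribute strings; the real class may differ:
class Perception(tuple):
    @classmethod
    def empty(cls):
        # An empty perception - no attributes observed yet
        return cls()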
def _run_trial_explore(self, env, time, current_trial=None) \
        -> TrialMetrics:
    logger.debug("** Running trial explore **")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    action = env.action_space.sample()
    last_reward = 0
    prev_state = Perception.empty()
    action_set = ClassifiersList()
    done = False
    prev_M_best_fitness = 0
    was_greedy = False

    while not done:
        state = Perception(state)
        match_set = self.population.form_match_set(state)

        if steps > 0:
            # Apply learning in the last action set
            ClassifiersList.apply_alp(
                self.population, match_set, action_set,
                prev_state, action, state,
                time + steps, self.cfg.theta_exp, self.cfg)
            self.apply_reinforcement_learning(
                action_set, last_reward, prev_M_best_fitness,
                match_set.get_maximum_fitness(), was_greedy)
            if self.cfg.do_ga:
                ClassifiersList.apply_ga(
                    time + steps, self.population, match_set,
                    action_set, state, self.cfg.theta_ga, self.cfg.mu,
                    self.cfg.chi, self.cfg.theta_as,
                    self.cfg.do_subsumption, self.cfg.theta_exp)

        action, was_greedy = self._epsilon_greedy(match_set)
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        logger.debug("\tExecuting action: [%d]", action)
        action_set = match_set.form_action_set(action)

        prev_state = Perception(state)
        prev_M_best_fitness = match_set.get_maximum_fitness()

        raw_state, last_reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)
        state = Perception(state)

        if done:
            ClassifiersList.apply_alp(
                self.population, ClassifiersList(), action_set,
                prev_state, action, state,
                time + steps, self.cfg.theta_exp, self.cfg)
            self.apply_reinforcement_learning(
                action_set, last_reward,
                prev_M_best_fitness, 0, was_greedy)
            if self.cfg.do_ga:
                ClassifiersList.apply_ga(
                    time + steps, self.population, ClassifiersList(),
                    action_set, state, self.cfg.theta_ga, self.cfg.mu,
                    self.cfg.chi, self.cfg.theta_as,
                    self.cfg.do_subsumption, self.cfg.theta_exp)

        steps += 1

    return TrialMetrics(steps, last_reward)
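# NOTE: `_epsilon_greedy` above returns both the chosen action and a
# flag telling the RL update whether the choice was greedy. A minimal
# sketch, assuming classifiers expose `fitness` and `action`; the
# exact tie-breaking and exploration scheme are assumptions:
import random

def _epsilon_greedy(self, match_set):
    if random.random() < self.cfg.epsilon:
        # Exploratory move - flagged so the update can treat it apart
        action = random.randrange(self.cfg.number_of_possible_actions)
        return action, False
    best = max(match_set, key=lambda cl: cl.fitness)
    return best.action, True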
def _run_trial_explore(self, env, time, current_trial=None) \
        -> TrialMetrics:
    """
    Executes an explore trial.

    Parameters
    ----------
    env
        Environment to interact with
    time
        Current time step, used to stamp ALP and GA applications

    Returns
    -------
    TrialMetrics
        Total steps taken and final reward
    """
    logger.debug("** Running trial explore **")

    # Initial conditions
    steps = 0
    raw_state = env.reset()
    state = self.cfg.environment_adapter.to_genotype(raw_state)
    action = env.action_space.sample()
    reward = 0
    prev_state = Perception.empty()
    action_set = ClassifierList()
    done = False

    while not done:
        match_set = self.population.form_match_set(state)

        if steps > 0:
            # Apply learning in the last action set
            ClassifierList.apply_alp(
                self.population, match_set, action_set,
                prev_state, action, state,
                time + steps, self.cfg.theta_exp, self.cfg)
            ClassifierList.apply_reinforcement_learning(
                action_set, reward,
                match_set.get_maximum_fitness(),
                self.cfg.beta, self.cfg.gamma)
            if self.cfg.do_ga:
                ClassifierList.apply_ga(
                    time + steps, self.population, match_set,
                    action_set, state, self.cfg.theta_ga, self.cfg.mu,
                    self.cfg.chi, self.cfg.theta_as,
                    self.cfg.do_subsumption, self.cfg.theta_exp)

        action = choose_action(
            match_set,
            self.cfg.number_of_possible_actions,
            self.cfg.epsilon,
            self.cfg.biased_exploration)
        logger.debug("\tExecuting action: [%d]", action)
        action_set = match_set.form_action_set(action)

        prev_state = state
        iaction = self.cfg.environment_adapter.to_lcs_action(action)
        raw_state, reward, done, _ = env.step(iaction)
        state = self.cfg.environment_adapter.to_genotype(raw_state)

        if done:
            ClassifierList.apply_alp(
                self.population, ClassifierList(), action_set,
                prev_state, action, state,
                time + steps, self.cfg.theta_exp, self.cfg)
            ClassifierList.apply_reinforcement_learning(
                action_set, reward, 0,
                self.cfg.beta, self.cfg.gamma)
            if self.cfg.do_ga:
                ClassifierList.apply_ga(
                    time + steps, self.population, ClassifierList(),
                    action_set, state, self.cfg.theta_ga, self.cfg.mu,
                    self.cfg.chi, self.cfg.theta_as,
                    self.cfg.do_subsumption, self.cfg.theta_exp)

        steps += 1

    return TrialMetrics(steps, reward)
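# NOTE: a sketch of the Q-learning-style update performed by
# `apply_reinforcement_learning` above, following the standard ACS2
# credit assignment; the attribute names `r` (reward prediction) and
# `ir` (immediate reward prediction) are assumptions:
def apply_reinforcement_learning(action_set, reward, max_fitness,
                                 beta, gamma):
    for cl in action_set:
        # r <- r + beta * (rho + gamma * max P([M]) - r)
        cl.r += beta * (reward + gamma * max_fitness - cl.r)
        # ir <- ir + beta * (rho - ir)
        cl.ir += beta * (reward - cl.ir)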