def get_q_value(self, state_as_dict: StateAsDict, action: Action) -> float:
    assert self.network is not None, 'You must create network first.'
    self.set_corrent_session()
    vector_of_state = vectorize_state(state_as_dict)
    vector_of_action = vectorize_action(action)
    input_vec = np.array(vector_of_state + vector_of_action)
    return self.network.predict(x=input_vec.reshape(1, 597))[0]

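# Illustrative helper (not in the original code): queries get_q_value once per
# action instead of batching. `q_function` stands for any object exposing the
# method above; the helper name and arguments are assumptions, kept only to show
# how a single state-action query is made. For many actions, the batched
# evaluate_list below is the cheaper call.
def get_q_values_one_by_one(q_function, state_as_dict, list_of_actions):
    return [q_function.get_q_value(state_as_dict, action) for action in list_of_actions]
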
def choose_act(self, mode) -> Action:
    # First we load the observation into the private environment.
    current_points = self.env.current_state_of_the_game.active_players_hand().number_of_my_points()
    if len(self.env.action_space.list_of_actions) > 0:
        actions = []
        potential_reward_max = self.action_to_avoid
        numerator = self.depth - 1
        primary_state = StateAsDict(self.env.current_state_of_the_game)
        self.env_dict[numerator] = StateAsDict(self.env.current_state_of_the_game)
        for action in self.env.action_space.list_of_actions:
            ae = action.evaluate(self.env.current_state_of_the_game)
            # Weighted heuristic: win bonus, card points, nobles, card profit and gem flow.
            potential_reward = (np.floor((current_points + ae["card"][2]) / POINTS_TO_WIN) * self.weight[0] +
                                self.weight[1] * ae["card"][2] + self.weight[2] * ae["nobles"] +
                                self.weight[3] * ae["card"][0] + self.weight[4] * sum(ae["gems_flow"]))
            # Subtract the opponent's discounted evaluation obtained by searching one level deeper.
            potential_reward -= self.decay * self.deep_evaluation(action, numerator - 1, mode)
            self.restore_env(numerator)
            if self.collect_stats:
                self.stats_dataframe = self.stats_dataframe.append(
                    {'state': primary_state,
                     'action': action.to_dict(),
                     'evaluation': potential_reward},
                    ignore_index=True)
                self.stats_dataframe_vectorized = self.stats_dataframe_vectorized.append(
                    {'state_vector': vectorize_state(primary_state),
                     'action_vector': vectorize_action(action),
                     'evaluation': potential_reward},
                    ignore_index=True)
            if potential_reward > potential_reward_max:
                potential_reward_max = potential_reward
                actions = [action]
            elif potential_reward == potential_reward_max:
                actions.append(action)
        self.env.reset()
        self.env.load_state_from_dict(self.env_dict[numerator])
        # Break ties between equally rated actions at random.
        return random.choice(actions)
    else:
        return None

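# Illustrative scoring helper (not in the original code): restates the weighted
# heuristic used in choose_act for a single action-evaluation dict `ae`, assuming
# the same weight layout (win bonus, card points, nobles, card profit, gem flow).
# The function name and arguments are hypothetical.
def heuristic_score(ae, current_points, weights, points_to_win=POINTS_TO_WIN):
    win_bonus = np.floor((current_points + ae["card"][2]) / points_to_win) * weights[0]
    return (win_bonus + weights[1] * ae["card"][2] + weights[2] * ae["nobles"] +
            weights[3] * ae["card"][0] + weights[4] * sum(ae["gems_flow"]))
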
def evaluate_list(self, state_as_dict: StateAsDict, list_of_actions: List[Action]):
    assert self.network is not None, 'You must create network first.'
    self.set_corrent_session()
    X = []
    if len(list_of_actions) > 0:
        vector_of_state = vectorize_state(state_as_dict)
        for action in list_of_actions:
            state_action_concat = vector_of_state + vectorize_action(action)
            X.append(state_action_concat)
        X = np.array(X)
        q_values_predicted = self.network.predict(X)
        return q_values_predicted
    else:
        return None

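# Illustrative helper (not in the original code): picks the index of the action
# with the highest predicted Q-value using the batched evaluate_list above.
# `q_function` and the helper name are assumptions.
def pick_greedy_action_index(q_function, state_as_dict, list_of_actions):
    q_values = q_function.evaluate_list(state_as_dict, list_of_actions)
    if q_values is None:
        return None
    return int(np.argmax(q_values))
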
def run_one_game_and_collect_data(self, debug_info=True):
    last_value_player_0 = None
    last_value_player_1 = None
    old_value = None
    self.env.reset()
    observation = self.env.show_observation('deterministic')
    is_done = False
    number_of_moves = 0
    debug_collected_data = pd.DataFrame(columns=('active_player_id', 'winner_id', 'reward', 'best_value'))
    collected_data = pd.DataFrame(columns=('state_as_vector', 'value'))
    extra_move_done = False
    while not (is_done and extra_move_done) and number_of_moves < MAX_NUMBER_OF_MOVES:
        if is_done:
            extra_move_done = True
        action, best_value = self.agent.choose_action(observation, [None], info=True)
        observation, reward, is_done, info = self.env.step('deterministic', action)
        previous_player_id = self.env.previous_player_id()
        winner_id = info['winner_id']
        current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
        # Recall the value this player predicted on its previous move.
        if previous_player_id == 0:
            old_value = last_value_player_0
        if previous_player_id == 1:
            old_value = last_value_player_1
        if debug_info:
            debug_collected_data = debug_collected_data.append(
                {'new_value': self.new_value_formula(old_value, best_value, winner_id, reward, alpha=0.1),
                 'active_player_id': self.env.previous_player_id(),
                 'winner_id': winner_id,
                 'reward': reward,
                 'best_value': best_value,
                 'old_value': old_value,
                 'pa_points': self.env.previous_players_hand().number_of_my_points()},
                ignore_index=True)
        collected_data = collected_data.append(
            {'state_as_vector': vectorize_state(current_state_as_dict),
             'value': self.new_value_formula(old_value, best_value, winner_id, reward, alpha=0.1)},
            ignore_index=True)
        if previous_player_id == 0:
            last_value_player_0 = best_value
        if previous_player_id == 1:
            last_value_player_1 = best_value
        # Let the opponent move:
        number_of_moves += 1
    if debug_info:
        debug_collected_data.to_csv('debug_info.csv')
    # Drop the first two rows: neither player has an old value on its first move.
    collected_data = collected_data.iloc[2:]
    return collected_data

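# new_value_formula is called above but not shown in this excerpt. The sketch
# below is only one plausible TD-style blend consistent with its call signature
# (old value, best value at the next decision, winner id, reward, alpha); it is
# an assumption, not the original implementation.
def example_new_value_formula(old_value, best_value, winner_id, reward, alpha):
    if old_value is None:
        return best_value
    target = reward if winner_id is not None else best_value
    return (1 - alpha) * old_value + alpha * target
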
def run_one_game_and_collect_data(self, debug_info=True):
    there_was_no_action = False
    self.agent.train_mode()
    last_actual_player_0 = None
    last_actual_player_1 = None
    last_state_player_0 = None
    last_state_player_1 = None
    last_action_vec_player_0 = None
    last_action_vec_player_1 = None
    old_value = None
    old_state = None
    old_action_vec = None
    self.env.reset()
    observation = self.env.show_observation('deterministic')
    is_done = False
    number_of_moves = 0
    debug_collected_data = pd.DataFrame(columns=('active_player_id', 'winner_id', 'reward', 'best_value'))
    collected_data = pd.DataFrame(columns=('state_as_vector', 'value'))
    extra_move_done = False
    while not (is_done and extra_move_done) and number_of_moves < MAX_NUMBER_OF_MOVES:
        if is_done:
            extra_move_done = True
        current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
        actual_action, actual_eval, best_eval = self.agent.choose_action(observation, [None])
        if actual_action is None:
            there_was_no_action = True
            break
        observation, reward, is_done, info = self.env.step('deterministic', actual_action)
        previous_player_id = self.env.previous_player_id()
        winner_id = info['winner_id']
        # Recall the evaluation, state and action vector from this player's previous move.
        if previous_player_id == 0:
            old_value = last_actual_player_0
            old_state = last_state_player_0
            old_action_vec = last_action_vec_player_0
        if previous_player_id == 1:
            old_value = last_actual_player_1
            old_state = last_state_player_1
            old_action_vec = last_action_vec_player_1
        if debug_info:
            state_status = old_state.__repr__() if old_state is not None else 'NONE'
            state_vector = vectorize_state(old_state) if old_state is not None else 'NONE'
            debug_collected_data = debug_collected_data.append(
                {'state_ex': state_status,
                 'state_vec': state_vector,
                 'new_value': self.new_value_formula(old_value, best_eval, winner_id, reward, self.alpha),
                 'active_player_id': self.env.previous_player_id(),
                 'winner_id': winner_id,
                 'reward': reward,
                 'best_eval': best_eval,
                 'actual_eval': actual_eval,
                 'old_value': old_value,
                 'pa_points': self.env.previous_players_hand().number_of_my_points()},
                ignore_index=True)
        if old_state is not None:
            collected_data = collected_data.append(
                {'state_as_vector': vectorize_state(old_state),
                 'action_vector': old_action_vec,
                 'value': self.new_value_formula(old_value, best_eval, winner_id, reward, self.alpha)},
                ignore_index=True)
        if previous_player_id == 0:
            last_actual_player_0 = actual_eval
            last_state_player_0 = current_state_as_dict
            last_action_vec_player_0 = vectorize_action(actual_action)
        if previous_player_id == 1:
            last_actual_player_1 = actual_eval
            last_state_player_1 = current_state_as_dict
            last_action_vec_player_1 = vectorize_action(actual_action)
        # Let the opponent move:
        number_of_moves += 1
    if debug_info:
        debug_collected_data.to_csv('debug_info.csv')
    # All rows are kept here; first moves are already skipped by the old_state check above.
    collected_data = collected_data.iloc[0:]
    self.agent.test_mode()
    return collected_data

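# Illustrative training loop (not in the original code): plays several self-play
# games with the method above, concatenates the collected frames and fits a
# Q-network on the resulting state-action vectors. `trainer`, `n_games`, `epochs`
# and the trainer.agent.network attribute with a Keras-style fit are all assumptions.
def collect_and_train(trainer, n_games=10, epochs=1):
    frames = [trainer.run_one_game_and_collect_data(debug_info=False) for _ in range(n_games)]
    data = pd.concat(frames, ignore_index=True)
    X = np.array([list(s) + list(a) for s, a in zip(data['state_as_vector'], data['action_vector'])])
    y = np.array(data['value'].tolist())
    trainer.agent.network.fit(X, y, epochs=epochs)
    return data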