class StateEncoder(AbstractModel):
    """Keras value network over a vectorized game state.

    The constructor wires a board encoder and a player encoder (the same
    player encoder instance is applied to both the active and the other
    player's inputs), concatenates the three encodings, passes them through
    two dense layers and finally through ``final_layer``.

    Two Keras models share those layers:

    * ``self.layer``   -- outputs the last dense representation,
    * ``self.network`` -- outputs ``final_layer`` applied to it; this is the
      model that is compiled, trained and used for prediction.

    Every constructor argument is also stored in ``self.params``.
    NOTE(review): ``self.params`` and ``start_neptune_experiment`` are not
    defined in this class; presumably they come from ``AbstractModel``.
    """

    def __init__(self,
                 gems_encoder_dim: int = None,
                 price_encoder_dim: int = None,
                 profit_encoder_dim: int = None,
                 cards_points_dim: int = None,
                 cards_dense1_dim: int = None,
                 cards_dense2_dim: int = None,
                 board_nobles_dense1_dim: int = None,
                 board_nobles_dense2_dim: int = None,
                 full_board_dense1_dim: int = None,
                 full_board_dense2_dim: int = None,
                 player_points_dim: int = None,
                 player_nobles_dim: int = None,
                 full_player_dense1_dim: int = None,
                 full_player_dense2_dim: int = None,
                 final_layer=None,
                 data_transformer=None,
                 network_name: str = None
                 ):
        """Build and compile the network.

        The ``*_dim`` arguments are forwarded to the sub-encoders and dense
        layers. ``final_layer`` must be callable on a tensor and expose
        ``name`` and ``get_value``; ``data_transformer`` must expose ``name``
        and ``transform_array``. ``network_name`` is used as the experiment
        name when training is logged.
        """
        super().__init__()
        self.vectorizer = Vectorizer()
        self.final_layer = final_layer
        self.data_transformer = data_transformer

        # Hyper-parameters recorded for experiment tracking. Keys are runtime
        # strings consumed elsewhere, so they are kept exactly as they were
        # (including the spelling of 'data transormation').
        recorded_params = (
            ('data transormation', self.data_transformer.name),
            ('final layer name', self.final_layer.name),
            ('gems_encoder_dim', gems_encoder_dim),
            ('price_encoder_dim', price_encoder_dim),
            ('profit_encoder_dim', profit_encoder_dim),
            ('cards_points_dim', cards_points_dim),
            ('cards_dense1_dim', cards_dense1_dim),
            ('cards_dense2_dim', cards_dense2_dim),
            ('board_nobles_dense1_dim', board_nobles_dense1_dim),
            ('board_nobles_dense2_dim', board_nobles_dense2_dim),
            ('full_board_dense1_dim', full_board_dense1_dim),
            ('full_board_dense2_dim', full_board_dense2_dim),
            ('player_points_dim', player_points_dim),
            ('player_nobles_dim', player_nobles_dim),
            ('full_player_dense1_dim', full_player_dense1_dim),
            ('full_player_dense2_dim', full_player_dense2_dim),
        )
        for key, value in recorded_params:
            self.params[key] = value

        # Evaluation infrastructure: an arena, this network wrapped as an
        # agent, and three fixed opponents used by check_performance().
        self.arena = Arena()
        self.network_agent = ValueNNAgent(self)
        self.easy_opp = RandomAgent(distribution='first_buy')
        self.medium_opp = GreedyAgentBoost()
        self.hard_opp = MinMaxAgent()

        self.neptune_monitor = NeptuneMonitor()
        self.network_name = network_name

        # Sub-encoders. Construction order is kept identical to the original
        # so that automatically generated layer names do not change.
        self.gems_encoder = GemsEncoder(gems_encoder_dim)
        self.price_encoder = PriceEncoder(price_encoder_dim)

        board_nobles_encoder = ManyNoblesEncoder(price_encoder_dim,
                                                 board_nobles_dense1_dim,
                                                 board_nobles_dense2_dim)
        board_cards_encoder = ManyCardsEncoder(MAX_CARDS_ON_BORD,
                                               profit_encoder_dim,
                                               price_encoder_dim,
                                               cards_points_dim,
                                               cards_dense1_dim,
                                               cards_dense2_dim)
        self.board_encoder = BoardEncoder(self.gems_encoder,
                                          board_nobles_encoder,
                                          board_cards_encoder,
                                          full_board_dense1_dim,
                                          full_board_dense2_dim)

        reserved_cards_encoder = ManyCardsEncoder(MAX_RESERVED_CARDS,
                                                  profit_encoder_dim,
                                                  price_encoder_dim,
                                                  cards_points_dim,
                                                  cards_dense1_dim,
                                                  cards_dense2_dim)
        self.player_encoder = PlayerEncoder(self.gems_encoder,
                                            self.price_encoder,
                                            reserved_cards_encoder,
                                            player_points_dim,
                                            player_nobles_dim,
                                            full_player_dense1_dim,
                                            full_player_dense2_dim)

        # Inputs: board first, then active player, then other player.
        active_player_input = PlayersInputGenerator('active_').inputs
        other_player_input = PlayersInputGenerator('other_').inputs
        board_input = self.board_encoder.inputs
        self.inputs = board_input + active_player_input + other_player_input

        board_encoded = self.board_encoder(board_input)
        active_player_encoded = self.player_encoder(active_player_input)
        other_player_encoded = self.player_encoder(other_player_input)

        full_state = Concatenate(axis=-1)([board_encoded, active_player_encoded, other_player_encoded])
        full_state = Dense(full_player_dense1_dim, activation='relu')(full_state)
        final_state = Dense(full_player_dense2_dim, activation='relu')(full_state)
        result = self.final_layer(final_state)

        # Both models intentionally keep the same name as in the original.
        self.layer = Model(inputs=self.inputs, outputs=final_state, name='full_state_splendor_estimator')
        self.network = Model(inputs=self.inputs, outputs=result, name='full_state_splendor_estimator')
        self.network.compile(Adam(), loss='mean_squared_error')

        self.params['Model name'] = 'Average pooling model'
        self.params['optimizer_name'] = 'Adam'

    def get_value(self, state):
        """Return the network's value for ``state``.

        The state is vectorized, passed through ``self.network`` and the raw
        prediction is converted by ``final_layer.get_value``.
        """
        prediction = self.network.predict(self.vectorizer.state_to_input(state))
        return self.final_layer.get_value(prediction)

    def train_on_mcts_data(self, data_frame, train_epochs: int):
        """Fit the network on the 'state' / 'mcts_value' columns of ``data_frame``.

        Returns whatever ``self.network.fit`` returns.
        """
        X = data_frame['state']
        Y = data_frame['mcts_value']
        X = self.vectorizer.many_states_to_input(X)
        Y = self.data_transformer.transform_array(Y)
        fit_history = self.network.fit(X, Y, epochs=train_epochs)
        return fit_history

    def train_network_on_many_sets(self, train_dir=None, validation_file=None, epochs=None, batch_size=None, test_games=1):
        """Train for ``epochs`` rounds, one training file per round.

        Files in ``train_dir`` are cycled through in the order returned by
        ``os.listdir``; each round fits for a single Keras epoch against the
        validation set loaded from ``validation_file``. Progress is reported
        through ``self.neptune_monitor``.

        ``test_games`` is accepted for interface compatibility but is not
        used by this method.

        NOTE(review): ``validation_file`` is unpickled; only pass trusted files.
        """
        assert self.network is not None, 'You must create network before training'

        with open(validation_file, 'rb') as f:
            X_val, Y_val = pickle.load(f)

        X_val = self.vectorizer.many_states_to_input(X_val)
        Y_val = self.data_transformer.transform_array(Y_val)
        self.neptune_monitor.reset_epoch_counter()
        file1, file2 = self.gather_data_info(train_dir, validation_file)
        self.start_neptune_experiment(experiment_name=self.network_name,
                                      description='Training avg_pool arch network',
                                      neptune_monitor=self.neptune_monitor)
        self.neptune_monitor.log_histograms(file1, file2)

        files_for_training = os.listdir(train_dir)
        for epoch in range(epochs):
            print(f'\n Epoch {epoch}: \n')
            file_epoch = epoch % len(files_for_training)
            X, Y = load_data_for_model(os.path.join(train_dir, files_for_training[file_epoch]))
            X = self.vectorizer.many_states_to_input(X)
            Y = self.data_transformer.transform_array(Y)
            self.network.fit(x=X, y=Y, epochs=1, batch_size=batch_size,
                             validation_data=(X_val, Y_val),
                             callbacks=[self.neptune_monitor])
            # Drop the references before the next file is loaded.
            del X
            del Y

        neptune.stop()

    def dump_weights(self, file_name):
        """Save the weights of ``self.network`` to ``file_name``."""
        self.network.save_weights(file_name)

    def load_weights(self, file_name):
        """Load weights for ``self.network`` from ``file_name``."""
        self.network.load_weights(file_name)

    def gather_data_info(self, train_dir, validation_file):
        """Record data-set sizes and save target histograms.

        The first file listed in ``train_dir`` and ``validation_file`` are
        unpickled; the lengths of their targets are stored in ``self.params``
        and histograms of the transformed targets are written to
        ``temp/train_hist.png`` and ``temp/valid_hist.png``.

        Returns the two image paths ``(train_hist, valid_hist)``.

        NOTE(review): both files are unpickled; only pass trusted files.
        """
        list_of_files = os.listdir(train_dir)
        example_file = list_of_files[0]

        with open(os.path.join(train_dir, example_file), 'rb') as f1:
            _, Y_ex = pickle.load(f1)

        with open(validation_file, 'rb') as f2:
            _, Y_val = pickle.load(f2)

        self.params['train set size'] = len(Y_ex)
        self.params['valid set size'] = len(Y_val)

        # savefig does not create missing directories, so make sure the
        # output folder exists before plotting.
        os.makedirs('temp', exist_ok=True)
        file1 = os.path.join('temp', 'train_hist.png')
        file2 = os.path.join('temp', 'valid_hist.png')

        Y_ex = self.data_transformer.transform_array(Y_ex)
        Y_val = self.data_transformer.transform_array(Y_val)

        plt.hist(Y_ex, bins=100)
        plt.savefig(file1)
        plt.clf()
        plt.hist(Y_val, bins=100)
        plt.savefig(file2)
        # Leave the current figure empty so later plots start clean.
        plt.clf()
        return file1, file2

    def check_performance(self, n_games, opponents):
        """Play ``n_games`` duels against each requested opponent level.

        ``opponents`` is a container of level names; the recognised names are
        'easy', 'medium' and 'hard'. Returns a dict mapping each requested
        level to the third value of ``return_stats()`` divided by ``n_games``.
        """
        opponent_by_level = (
            ('easy', self.easy_opp),
            ('medium', self.medium_opp),
            ('hard', self.hard_opp),
        )
        performance_results = {}
        for level, opponent in opponent_by_level:
            if level not in opponents:
                continue
            duel_results = self.arena.run_many_duels('deterministic',
                                                     [self.network_agent, opponent],
                                                     n_games,
                                                     shuffle_agents=True)
            _, _, win_rate = duel_results.return_stats()
            performance_results[level] = win_rate / n_games
        return performance_results

    def run_test(self, n_games):
        """Evaluate against the easy opponent and log the result."""
        results = self.check_performance(n_games, ['easy'])
        self.neptune_monitor.log_win_rates(['easy'], results)

    def evaluate_fixed_states(self):
        """Log the network's value for every state in ``list_of_fixes_states``.

        NOTE(review): ``list_of_fixes_states`` is a module-level name defined
        outside this class.
        """
        results = [self.get_value(f_state) for f_state in list_of_fixes_states]
        self.neptune_monitor.log_state_values(results)
class QLearningTrainer:
    """Self-play Q-learning loop around a ``QValueAgent``.

    The trainer plays games in the 'splendor-v0' environment with the agent
    choosing every move, turns each game into (state, action, target value)
    rows and feeds those rows to ``self.agent.model.train_model``.

    ``alpha`` is the mixing coefficient used by ``new_value_formula``.
    """

    def __init__(self, alpha):
        self.agent = QValueAgent()
        self.env = gym_open_ai.make('splendor-v0')
        # Random file name so that several trainers do not share a weights file.
        self.weights_token = 'weights_' + str(random.randint(0,1000000)) + '.h5'
        self.arena = Arena()
        self.alpha = alpha

    def _set_token(self, token):
        """Replace the file name used for saving/loading weights."""
        self.weights_token = token

    def _get_token(self):
        """Return the file name used for saving/loading weights."""
        return self.weights_token

    def _save_weights(self):
        """Save the agent's model weights to ``self.weights_token``."""
        self.agent.model.save_weights(self.weights_token)

    def _load_weights(self):
        """Load the agent's model weights from ``self.weights_token``."""
        self.agent.model.load_weights(self.weights_token)

    def new_value_formula(self, old_value, best_value, winner_id, reward, alpha):
        """Compute the training target for a previously visited state.

        * If the game has a winner, the target is ``reward``.
        * Otherwise, if both ``old_value`` and ``best_value`` are known, the
          target is ``(1 - alpha) * old_value + alpha * best_value``.
        * Otherwise ``None`` is returned.

        Values are compared against ``None`` explicitly, so an old value of
        exactly zero is a valid input and is not mistaken for "missing".
        """
        if winner_id is not None:
            return reward
        if old_value is not None and best_value is not None:
            return (1-alpha)*old_value + alpha*best_value
        return None

    def run_one_game_and_collect_data(self, debug_info=True):
        """Play one game and return the collected training rows.

        Returns a DataFrame with the columns 'state_as_vector' and 'value',
        plus 'action_vector' once at least one row has been collected. When
        ``debug_info`` is true a per-move trace is written to
        'debug_info.csv'.
        """
        collected_data, _ = self._play_one_game(debug_info=debug_info)
        return collected_data

    def _play_one_game(self, debug_info=True):
        """Play one game; return ``(collected_data, there_was_no_action)``.

        ``there_was_no_action`` is True when the agent returned no action and
        the game was cut short.
        """
        there_was_no_action = False
        self.agent.train_mode()

        # What each player evaluated, saw and did on their own previous move.
        last_actual_player_0 = None
        last_actual_player_1 = None
        last_state_player_0 = None
        last_state_player_1 = None
        last_action_vec_player_0 = None
        last_action_vec_player_1 = None
        old_value = None
        old_state = None
        old_action_vec = None

        self.env.reset()
        observation = self.env.show_observation('deterministic')
        is_done = False
        number_of_moves = 0

        # NOTE(review): the debug frame declares a 'best_value' column but the
        # rows below write 'best_eval'; left unchanged to keep the CSV layout.
        debug_collected_data = pd.DataFrame(columns=('active_player_id', 'winner_id', 'reward', 'best_value'))
        collected_data = pd.DataFrame(columns=('state_as_vector', 'value'))
        # NOTE(review): DataFrame.append (used below) was removed in pandas 2.0.

        # One additional move is played after the environment reports done,
        # so the other player's last state also receives a final target.
        extra_move_done = False
        while not (is_done and extra_move_done) and number_of_moves < MAX_NUMBER_OF_MOVES:
            if is_done:
                extra_move_done = True

            current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)
            actual_action, actual_eval, best_eval = self.agent.choose_action(observation, [None])
            if actual_action is None:
                there_was_no_action = True
                break

            observation, reward, is_done, info = self.env.step('deterministic', actual_action)
            previous_player_id = self.env.previous_player_id()
            winner_id = info['winner_id']

            # Recall the mover's own previous move; that is the row being updated.
            if previous_player_id == 0:
                old_value = last_actual_player_0
                old_state = last_state_player_0
                old_action_vec = last_action_vec_player_0
            elif previous_player_id == 1:
                old_value = last_actual_player_1
                old_state = last_state_player_1
                old_action_vec = last_action_vec_player_1

            new_value = self.new_value_formula(old_value, best_eval, winner_id, reward, self.alpha)

            if debug_info:
                state_status = old_state.__repr__() if old_state is not None else 'NONE'
                state_vector = vectorize_state(old_state) if old_state is not None else 'NONE'
                debug_collected_data = debug_collected_data.append({
                    'state_ex' : state_status,
                    'state_vec' : state_vector,
                    'new_value': new_value,
                    'active_player_id' : self.env.previous_player_id(),
                    'winner_id' : winner_id,
                    'reward' : reward,
                    'best_eval' : best_eval,
                    'actual_eval' : actual_eval,
                    'old_value': old_value,
                    'pa_points' : self.env.previous_players_hand().number_of_my_points()},
                    ignore_index=True)

            if old_state is not None:
                collected_data = collected_data.append({
                    'state_as_vector' : vectorize_state(old_state),
                    'action_vector' : old_action_vec,
                    'value': new_value},
                    ignore_index=True)

            # Remember this move for the mover's next turn.
            if previous_player_id == 0:
                last_actual_player_0 = actual_eval
                last_state_player_0 = current_state_as_dict
                last_action_vec_player_0 = vectorize_action(actual_action)
            elif previous_player_id == 1:
                last_actual_player_1 = actual_eval
                last_state_player_1 = current_state_as_dict
                last_action_vec_player_1 = vectorize_action(actual_action)

            number_of_moves += 1

        if debug_info:
            debug_collected_data.to_csv('debug_info.csv')

        self.agent.test_mode()
        return collected_data, there_was_no_action

    def train_network(self, collected_data, epochs):
        """Train the agent's model on ``collected_data`` for ``epochs`` epochs."""
        self.agent.model.train_model(data_frame=collected_data, epochs=epochs)

    def run_test(self, opponent: Agent):
        """Run duels between this trainer's agent and ``opponent`` and print them.

        NOTE(review): ``comm`` and ``main_process`` are not defined in this
        class, and the keyword names passed here (``n_games``,
        ``n_proc_per_agent``, ``shuffle``) differ from the ones used by the
        other ``run_many_duels`` calls in this file. This method looks like it
        was written for a multi-process arena -- confirm before relying on it.
        """
        # The original read self.local_trainer.agent, an attribute this class
        # never sets; the trainer's own agent is what is being evaluated.
        results = self.arena.run_many_duels('deterministic', [self.agent, opponent],
                                            n_games=comm.Get_size(), n_proc_per_agent=1, shuffle=True)
        if main_process:
            print(results)

    def run_training(self, n_iterations, opponent):
        """Play ``n_iterations`` self-play games, training as it goes.

        After every game that was not cut short the model is trained on that
        game's rows. Rows from every game are also accumulated in a replay
        buffer which is trained on every 20th game and cleared every 100th.
        Every 10th game the agent plays 50 duels against ``opponent``; results
        are printed and, when ``USE_NEPTUNE`` is set, sent to Neptune.
        """
        if USE_NEPTUNE:
            neptune.create_experiment('Q learning alpha = {}'.format(self.alpha))

        experience_replay_buffer = None
        for i in range(n_iterations):
            collected_data, there_was_no_action = self._play_one_game(debug_info=True)
            if not there_was_no_action:
                self.agent.model.train_model(data_frame=collected_data, epochs=1)

            if experience_replay_buffer is None:
                experience_replay_buffer = collected_data
            else:
                experience_replay_buffer = experience_replay_buffer.append(collected_data)

            print('Game number = {}'.format(i))

            if i%20 == 0 and i > 0:
                self.agent.model.train_model(data_frame=experience_replay_buffer, epochs=2)

            if i%100 == 0 and i > 0:
                experience_replay_buffer = None
                print('Clearing buffer')

            if i%10 == 0:
                if USE_NEPTUNE:
                    neptune.send_metric('epsilon', x=self.agent.epsilon)
                results = self.arena.run_many_duels('deterministic', [self.agent, opponent], number_of_games=50)
                print(results)
                if USE_NEPTUNE:
                    for pair in results.data.keys():
                        neptune.send_metric(pair[0] + '_wins', x=i, y=results.data[pair].wins)
                        neptune.send_metric(pair[0] + '_reward', x=i, y=results.data[pair].reward)
                        neptune.send_metric(pair[0] + '_victory_points', x=i, y=results.data[pair].victory_points)

        if USE_NEPTUNE:
            neptune.stop()
from agents.greedy_agent_boost import GreedyAgentBoost
from agents.random_agent import RandomAgent
from agents.single_mcts_agent import SingleMCTSAgent
from arena.arena import Arena
from monte_carlo_tree_search.evaluation_policies.heura_val import HeuraEvaluator

# Single duel between the greedy agent and a single-process MCTS agent.
arek = Arena()

# First contestant: greedy agent with its default settings.
a1 = GreedyAgentBoost()

# Second contestant: MCTS agent using the heuristic evaluator.
evaluator = HeuraEvaluator()
a2 = SingleMCTSAgent(150, evaluator, 0.4, True, False)

# Play one game in 'deterministic' mode and show the outcome.
duelists = [a1, a2]
results = arek.run_many_duels('deterministic', duelists, 1, True)
print(results)
# Compare RandomAgent variants against GreedyAgentBoost with different weights.
fight_pit = Arena()

# time_profile = cProfile.Profile()
# time_profile.run('fight_pit.run_many_duels([goku, gohan], number_of_games=100)')
# time_profile.dump_stats('optimization1.prof')

n_games = 10

# --- first greedy weighting ---
gohan = GreedyAgentBoost(weight = [100,2,2,1,0.1])
print(gohan.name)

# Every duel passes the mode explicitly as the first argument, matching the
# first call; the later calls originally omitted it and passed the agent list
# in its place.
goku = RandomAgent(distribution='uniform')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games = n_games, shuffle_agents=True))

goku = RandomAgent(distribution='uniform_on_types')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games = n_games, shuffle_agents=True))

goku = RandomAgent(distribution = 'first_buy')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games = n_games, shuffle_agents=True))

# --- second greedy weighting ---
gohan = GreedyAgentBoost(weight = [100,2.5,1.5,1,0.1])
print(gohan.name)

goku = RandomAgent(distribution='uniform')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games = n_games, shuffle_agents=True))

goku = RandomAgent(distribution='uniform_on_types')