Exemple #1
0
class StateEncoder(AbstractModel):
   def __init__(self,
                gems_encoder_dim: int = None,
                price_encoder_dim: int = None,
                profit_encoder_dim: int = None,
                cards_points_dim: int = None,
                cards_dense1_dim: int = None,
                cards_dense2_dim: int = None,
                board_nobles_dense1_dim: int = None,
                board_nobles_dense2_dim: int = None,
                full_board_dense1_dim: int = None,
                full_board_dense2_dim: int = None,
                player_points_dim: int = None,
                player_nobles_dim: int = None,
                full_player_dense1_dim: int = None,
                full_player_dense2_dim: int = None,
                final_layer=None,
                data_transformer=None,
                network_name: str = None
                ):
       """Build the full-state network together with its training/testing helpers.

       The ``*_dim`` arguments are forwarded to the gems, price, nobles, cards,
       board and player encoders created below and are also recorded in
       ``self.params``.

       :param final_layer: object called on the last dense output to produce
           the network output; its ``name`` attribute is recorded in
           ``self.params``. Required in practice: the ``None`` default fails
           as soon as ``.name`` is read.
       :param data_transformer: object whose ``name`` attribute is recorded in
           ``self.params``. Required in practice for the same reason.
       :param network_name: stored as ``self.network_name``.
       """
       super().__init__()
       self.vectorizer = Vectorizer()
       self.final_layer = final_layer
       self.data_transformer = data_transformer

       # NOTE(review): self.params is presumably created by AbstractModel.__init__ -- confirm.
       # The key spellings below (including 'data transormation') are kept as they
       # are because they are runtime strings other code may look up.
       self.params['data transormation'] = self.data_transformer.name
       self.params['final layer name'] = self.final_layer.name
       self.params['gems_encoder_dim'] = gems_encoder_dim
       self.params['price_encoder_dim'] = price_encoder_dim
       self.params['profit_encoder_dim'] = profit_encoder_dim
       self.params['cards_points_dim'] = cards_points_dim
       self.params['cards_dense1_dim'] = cards_dense1_dim
       self.params['cards_dense2_dim'] = cards_dense2_dim
       self.params['board_nobles_dense1_dim'] = board_nobles_dense1_dim
       self.params['board_nobles_dense2_dim'] = board_nobles_dense2_dim
       self.params['full_board_dense1_dim'] = full_board_dense1_dim
       self.params['full_board_dense2_dim'] = full_board_dense2_dim
       self.params['player_points_dim'] = player_points_dim
       self.params['player_nobles_dim'] = player_nobles_dim
       self.params['full_player_dense1_dim'] = full_player_dense1_dim
       self.params['full_player_dense2_dim'] = full_player_dense2_dim

       # Arena, the agent wrapping this model and the three sparring opponents
       # used by check_performance().
       self.arena = Arena()
       self.network_agent = ValueNNAgent(self)
       self.easy_opp = RandomAgent(distribution='first_buy')
       self.medium_opp = GreedyAgentBoost()
       self.hard_opp = MinMaxAgent()

       self.neptune_monitor = NeptuneMonitor()
       self.network_name = network_name

       # Sub-encoders. The gems encoder is shared by the board and player
       # encoders; the price encoder is used by the player encoder.
       self.gems_encoder = GemsEncoder(gems_encoder_dim)
       self.price_encoder = PriceEncoder(price_encoder_dim)
       self.board_encoder = BoardEncoder(self.gems_encoder,
                                          ManyNoblesEncoder(price_encoder_dim,
                                                            board_nobles_dense1_dim,
                                                            board_nobles_dense2_dim),
                                          ManyCardsEncoder(MAX_CARDS_ON_BORD,
                                                           profit_encoder_dim,
                                                           price_encoder_dim,
                                                           cards_points_dim,
                                                           cards_dense1_dim,
                                                           cards_dense2_dim
                                                           ),
                                          full_board_dense1_dim,
                                          full_board_dense2_dim)
       self.player_encoder = PlayerEncoder(self.gems_encoder,
                                            self.price_encoder,
                                            ManyCardsEncoder(MAX_RESERVED_CARDS,
                                                             profit_encoder_dim,
                                                             price_encoder_dim,
                                                             cards_points_dim,
                                                             cards_dense1_dim,
                                                             cards_dense2_dim
                                                             ),
                                            player_points_dim,
                                            player_nobles_dim,
                                            full_player_dense1_dim,
                                            full_player_dense2_dim)

       # Inputs: board first, then the active player, then the other player.
       active_player_input = PlayersInputGenerator('active_').inputs
       other_player_input = PlayersInputGenerator('other_').inputs
       board_input = self.board_encoder.inputs
       self.inputs = board_input + active_player_input + other_player_input

       # The same player_encoder object is applied to both players' inputs.
       board_encoded = self.board_encoder(board_input)
       active_player_encoded = self.player_encoder(active_player_input)
       other_player_encoded = self.player_encoder(other_player_input)
       full_state = Concatenate(axis=-1)([board_encoded, active_player_encoded, other_player_encoded])
       # The two closing dense layers reuse the full_player_dense*_dim sizes.
       full_state = Dense(full_player_dense1_dim, activation='relu')(full_state)
       final_state = Dense(full_player_dense2_dim, activation='relu')(full_state)
       result = self.final_layer(final_state)

       # self.layer stops at the encoded state; self.network adds final_layer
       # on top of it and is the model that gets compiled and trained.
       self.layer = Model(inputs = self.inputs, outputs = final_state, name = 'full_state_splendor_estimator')
       self.network = Model(inputs = self.inputs, outputs = result, name = 'full_state_splendor_estimator')
       self.network.compile(Adam(), loss='mean_squared_error')
       self.params['Model name'] = 'Average pooling model'
       self.params['optimizer_name'] = 'Adam'

   def get_value(self, state):
       """Return the value of ``state`` as decoded by the final layer.

       The state is vectorized, passed through ``self.network.predict`` and the
       raw prediction is converted with ``self.final_layer.get_value``.
       """
       network_input = self.vectorizer.state_to_input(state)
       raw_prediction = self.network.predict(network_input)
       return self.final_layer.get_value(raw_prediction)

   def train_on_mcts_data(self, data_frame, train_epochs:int):
       """Fit the network on the 'state' / 'mcts_value' columns of ``data_frame``.

       States are vectorized and target values are passed through the data
       transformer before fitting.

       :param train_epochs: forwarded to ``fit`` as ``epochs``.
       :return: whatever ``self.network.fit`` returns.
       """
       raw_states, raw_values = data_frame['state'], data_frame['mcts_value']
       network_inputs = self.vectorizer.many_states_to_input(raw_states)
       targets = self.data_transformer.transform_array(raw_values)
       return self.network.fit(network_inputs, targets, epochs=train_epochs)


   def train_network_on_many_sets(self, train_dir=None, validation_file=None, epochs=None, batch_size=None,
                                  test_games=1):
       """Train the network for ``epochs`` epochs, one training file per epoch.

       Files from ``train_dir`` are used in ``os.listdir`` order and cycled
       (``epoch % number_of_files``). Every epoch is validated against the data
       unpickled from ``validation_file`` and reported through
       ``self.neptune_monitor``.

       :param train_dir: directory with the training files.
       :param validation_file: pickle file holding an ``(X, Y)`` pair.
       :param epochs: number of single-epoch ``fit`` calls to run.
       :param batch_size: forwarded to ``fit``.
       :param test_games: accepted for compatibility; not used by this method.
       """
       assert self.network is not None, 'You must create network before training'

       # NOTE: pickle.load can execute arbitrary code -- only use trusted files.
       with open(validation_file, 'rb') as f:
           X_val, Y_val = pickle.load(f)

       X_val = self.vectorizer.many_states_to_input(X_val)
       Y_val = self.data_transformer.transform_array(Y_val)
       self.neptune_monitor.reset_epoch_counter()
       file1, file2 = self.gather_data_info(train_dir, validation_file)
       self.start_neptune_experiment(experiment_name=self.network_name, description='Training avg_pool arch network',
                                     neptune_monitor=self.neptune_monitor)
       try:
           self.neptune_monitor.log_histograms(file1, file2)
           files_for_training = os.listdir(train_dir)
           for epoch in range(epochs):
               print(f'\n Epoch {epoch}: \n')
               file_epoch = epoch % len(files_for_training)
               X, Y = load_data_for_model(os.path.join(train_dir, files_for_training[file_epoch]))
               X = self.vectorizer.many_states_to_input(X)
               Y = self.data_transformer.transform_array(Y)
               self.network.fit(x=X, y=Y, epochs=1, batch_size=batch_size,
                                validation_data=(X_val, Y_val),
                                callbacks=[self.neptune_monitor])
               # Drop the references before the next file is loaded.
               del X
               del Y
       finally:
           # Close the experiment even when loading or fitting fails.
           neptune.stop()

   def dump_weights(self, file_name):
       """Save the weights of ``self.network`` to ``file_name``."""
       model = self.network
       model.save_weights(file_name)

   def load_weights(self, file_name):
       """Load the weights stored in ``file_name`` into ``self.network``."""
       model = self.network
       model.load_weights(file_name)

   def gather_data_info(self, train_dir, validation_file):
       """Record data-set sizes and save target histograms for logging.

       The first file listed in ``train_dir`` is used as the training example.
       Set sizes are stored in ``self.params``; histograms of the transformed
       targets are written to ``temp/train_hist.png`` and ``temp/valid_hist.png``.

       :return: ``(train_histogram_path, validation_histogram_path)``.
       """
       list_of_files = os.listdir(train_dir)
       example_file = list_of_files[0]
       # NOTE: pickle.load can execute arbitrary code -- only use trusted files.
       with open(os.path.join(train_dir, example_file), 'rb') as f1:
           _, Y_ex = pickle.load(f1)
       with open(validation_file, 'rb') as f2:
           _, Y_val = pickle.load(f2)
       self.params['train set size'] = len(Y_ex)
       self.params['valid set size'] = len(Y_val)

       # savefig does not create missing directories.
       os.makedirs('temp', exist_ok=True)
       file1 = os.path.join('temp', 'train_hist.png')
       file2 = os.path.join('temp', 'valid_hist.png')
       Y_ex = self.data_transformer.transform_array(Y_ex)
       Y_val = self.data_transformer.transform_array(Y_val)
       plt.hist(Y_ex, bins=100)
       plt.savefig(file1)
       plt.clf()
       plt.hist(Y_val, bins=100)
       plt.savefig(file2)
       # Clear again so a later call (or any later plot) does not draw on top
       # of the validation histogram.
       plt.clf()
       return file1, file2

   def check_performance(self, n_games, opponents):
       """Play ``n_games`` duels against each requested opponent level.

       :param opponents: container checked for 'easy', 'medium' and 'hard'.
       :return: dict mapping each requested level to the third value of
           ``return_stats()`` divided by ``n_games``.
       """
       # Level name -> attribute holding the opponent; order fixes the key order
       # of the result. Attributes are looked up only for requested levels.
       levels = (('easy', 'easy_opp'), ('medium', 'medium_opp'), ('hard', 'hard_opp'))
       performance_results = {}
       for level, opponent_attr in levels:
           if level not in opponents:
               continue
           duel_results = self.arena.run_many_duels('deterministic',
                                                    [self.network_agent, getattr(self, opponent_attr)],
                                                    n_games,
                                                    shuffle_agents=True)
           _, _, third_stat = duel_results.return_stats()
           performance_results[level] = third_stat / n_games
       return performance_results

   def run_test(self, n_games):
       """Measure performance against the 'easy' opponent and log the result."""
       levels = ['easy']
       win_rates = self.check_performance(n_games, levels)
       self.neptune_monitor.log_win_rates(levels, win_rates)

   def evaluate_fixed_states(self):
       """Evaluate every state in ``list_of_fixes_states`` and log the values."""
       state_values = []
       for fixed_state in list_of_fixes_states:
           state_values.append(self.get_value(fixed_state))
       self.neptune_monitor.log_state_values(state_values)
class QLearningTrainer:

    def __init__(self, alpha):
        """Create the agent, environment and arena used for Q-learning.

        :param alpha: stored as ``self.alpha`` and later passed to
            ``new_value_formula`` as the blending factor.
        """
        self.agent = QValueAgent()
        self.env = gym_open_ai.make('splendor-v0')
        # Randomised file name under which the weights are saved and loaded.
        random_suffix = random.randint(0,1000000)
        self.weights_token = f'weights_{random_suffix}.h5'
        self.arena = Arena()
        self.alpha = alpha


    def _set_token(self, token):
        """Replace the weights file name used by ``_save_weights``/``_load_weights``."""
        new_token = token
        self.weights_token = new_token

    def _get_token(self):
        """Return the weights file name currently in use."""
        current_token = self.weights_token
        return current_token

    def _save_weights(self):
        """Save the agent model's weights to the file named by ``self.weights_token``."""
        model = self.agent.model
        model.save_weights(self.weights_token)

    def _load_weights(self):
        """Load the agent model's weights from the file named by ``self.weights_token``."""
        model = self.agent.model
        model.load_weights(self.weights_token)

    def new_value_formula(self, old_value, best_value, winner_id, reward, alpha):
        """Return the training target for a previously visited state.

        :param old_value: evaluation recorded for the earlier move, or None.
        :param best_value: best evaluation found for the current move, or None.
        :param winner_id: id of the winner, or None while nobody has won.
        :param reward: reward returned by the environment for the last step.
        :param alpha: blending factor between the old and the best value.
        :return: ``reward`` once there is a winner; otherwise
            ``(1 - alpha) * old_value + alpha * best_value``, or None when
            either value is missing.
        """
        if winner_id is not None:
            return reward
        # Compare against None explicitly: an old value of 0 is a legitimate
        # evaluation and must not be treated as "missing".
        if old_value is None or best_value is None:
            return None
        return (1-alpha)*old_value + alpha*best_value


    def run_one_game_and_collect_data(self, debug_info=True):
        """Let the agent play one game against itself and collect training samples.

        Each time a player has just moved, the state and action of that
        player's *previous* move are paired with a target computed by
        ``new_value_formula`` and stored as one sample. The loop runs for one
        extra move after the environment reports the game as done, and stops
        early when the agent returns no action or ``MAX_NUMBER_OF_MOVES`` is
        reached; samples gathered up to that point are still returned.

        :param debug_info: when true, a per-move trace is written to
            'debug_info.csv'.
        :return: DataFrame with the columns 'state_as_vector', 'value' and
            'action_vector'.
        """
        self.agent.train_mode()
        # player id -> (evaluation, state, action vector) of that player's last move
        last_move_by_player = {}
        self.env.reset()
        observation = self.env.show_observation('deterministic')
        is_done = False
        extra_move_done = False
        number_of_moves = 0

        # Rows are gathered in plain lists and turned into DataFrames once at
        # the end: DataFrame.append was removed in pandas 2.0 and copied the
        # whole frame on every call.
        debug_rows = []
        sample_rows = []

        while not (is_done and extra_move_done) and number_of_moves < MAX_NUMBER_OF_MOVES:

            if is_done:
                extra_move_done = True

            # Snapshot of the state the acting player chooses from.
            current_state_as_dict = StateAsDict(self.env.current_state_of_the_game)

            actual_action, actual_eval, best_eval = self.agent.choose_action(observation, [None])
            if actual_action is None:
                # Nothing can be played: stop the game here.
                break
            observation, reward, is_done, info = self.env.step('deterministic', actual_action)
            previous_player_id = self.env.previous_player_id()
            winner_id = info['winner_id']

            # What the player who has just moved did on their previous turn.
            old_value, old_state, old_action_vec = last_move_by_player.get(previous_player_id,
                                                                           (None, None, None))
            new_value = self.new_value_formula(old_value, best_eval, winner_id, reward, self.alpha)
            old_state_vector = vectorize_state(old_state) if old_state is not None else None

            if debug_info:
                debug_rows.append({
                    'state_ex': repr(old_state) if old_state is not None else 'NONE',
                    'state_vec': old_state_vector if old_state is not None else 'NONE',
                    'new_value': new_value,
                    'active_player_id': previous_player_id,
                    'winner_id': winner_id,
                    'reward': reward,
                    'best_eval': best_eval,
                    'actual_eval': actual_eval,
                    'old_value': old_value,
                    'pa_points': self.env.previous_players_hand().number_of_my_points()})

            if old_state is not None:
                sample_rows.append({'state_as_vector': old_state_vector,
                                    'action_vector': old_action_vec,
                                    'value': new_value})

            # Remember this move so it can be scored on the player's next turn.
            last_move_by_player[previous_player_id] = (actual_eval,
                                                       current_state_as_dict,
                                                       vectorize_action(actual_action))
            number_of_moves += 1

        if debug_info:
            # 'best_value' is never filled in; it is kept only so the file
            # retains the column the original header declared.
            debug_columns = ['active_player_id', 'winner_id', 'reward', 'best_value',
                             'state_ex', 'state_vec', 'new_value', 'best_eval',
                             'actual_eval', 'old_value', 'pa_points']
            pd.DataFrame(debug_rows, columns=debug_columns).to_csv('debug_info.csv')
        collected_data = pd.DataFrame(sample_rows, columns=['state_as_vector', 'value', 'action_vector'])
        self.agent.test_mode()
        return collected_data

    def train_network(self, collected_data, epochs):
        """Train the agent's model on ``collected_data`` for ``epochs`` epochs."""
        model = self.agent.model
        model.train_model(data_frame=collected_data, epochs=epochs)

    def run_test(self, opponent: Agent):
        """Play test duels between this trainer's agent and ``opponent`` and print the results.

        NOTE(review): ``comm`` and ``main_process`` are not defined in this
        class and must exist at module level -- confirm. The keyword names
        used here (``n_games``, ``n_proc_per_agent``, ``shuffle``) differ from
        those used in ``run_training`` (``number_of_games``) -- verify against
        the arena's ``run_many_duels`` signature.
        """
        # The trainer keeps its agent in self.agent; there is no
        # self.local_trainer attribute on this class.
        results = self.arena.run_many_duels('deterministic', [self.agent, opponent], n_games=comm.Get_size(),
                                        n_proc_per_agent=1, shuffle=True)
        if main_process:
            print(results)

    def run_training(self, n_iterations, opponent):
        """Run ``n_iterations`` self-play games, training after each one.

        After every game the model is trained for one epoch on that game's
        samples, which are also added to an experience-replay buffer. Every
        20th game the model is trained for two epochs on the buffer, every
        100th game the buffer is cleared, and every 10th game the agent plays
        50 duels against ``opponent``. Metrics go to Neptune when
        ``USE_NEPTUNE`` is set.

        :param n_iterations: number of games to play.
        :param opponent: agent used for the periodic evaluation duels.
        """
        if USE_NEPTUNE:
            neptune.create_experiment('Q learning alpha = {}'.format(self.alpha))
        try:
            experience_replay_buffer = None
            for i in range(n_iterations):
                game_outcome = self.run_one_game_and_collect_data(debug_info=True)
                if isinstance(game_outcome, tuple):
                    collected_data, there_was_no_action = game_outcome
                else:
                    # Only the samples were returned: a game that produced no
                    # samples has nothing to train on, so skip it.
                    collected_data = game_outcome
                    there_was_no_action = len(collected_data) == 0

                if not there_was_no_action:
                    self.agent.model.train_model(data_frame=collected_data, epochs=1)
                    if experience_replay_buffer is None:
                        experience_replay_buffer = collected_data
                    else:
                        experience_replay_buffer = pd.concat([experience_replay_buffer, collected_data])

                print('Game number = {}'.format(i))
                # Replay training; skipped while the buffer is still empty.
                if i%20 == 0 and i > 0 and experience_replay_buffer is not None:
                    self.agent.model.train_model(data_frame=experience_replay_buffer, epochs=2)

                if i%100 == 0 and i > 0:
                    experience_replay_buffer = None
                    print('Clearing buffer')

                # Periodic evaluation against the opponent.
                if i%10 == 0:
                    if USE_NEPTUNE:
                        neptune.send_metric('epsilon', x=self.agent.epsilon)
                    results = self.arena.run_many_duels('deterministic', [self.agent, opponent], number_of_games=50)
                    print(results)
                    if USE_NEPTUNE:
                        for pair in results.data.keys():
                            neptune.send_metric(pair[0] + '_wins', x=i, y=results.data[pair].wins)
                            neptune.send_metric(pair[0] + '_reward', x=i, y=results.data[pair].reward)
                            neptune.send_metric(pair[0] + '_victory_points', x=i, y=results.data[pair].victory_points)
        finally:
            # Close the experiment even when a game or a training step fails.
            if USE_NEPTUNE:
                neptune.stop()
from agents.greedy_agent_boost import GreedyAgentBoost
from agents.random_agent import RandomAgent
from agents.single_mcts_agent import SingleMCTSAgent
from arena.arena import Arena
from monte_carlo_tree_search.evaluation_policies.heura_val import HeuraEvaluator

# Run a duel between GreedyAgentBoost and SingleMCTSAgent and print the results.
arek = Arena()
a1 = GreedyAgentBoost()
# NOTE(review): the meaning of the positional arguments (150, evaluator, 0.4,
# True, False) is not visible here -- confirm against SingleMCTSAgent.
a2 = SingleMCTSAgent(150, HeuraEvaluator(), 0.4, True, False)

# presumably 1 game with agent shuffling enabled -- TODO confirm against Arena.run_many_duels
results = arek.run_many_duels('deterministic', [a1, a2], 1, True)
print(results)
Exemple #4
0
# Compare two GreedyAgentBoost weight settings against RandomAgent variants.
fight_pit = Arena()


# Optional profiling recipe, kept for reference:
# time_profile = cProfile.Profile()
# time_profile.run('fight_pit.run_many_duels([goku, gohan], number_of_games=100)')
# time_profile.dump_stats('optimization1.prof')


n_games = 10

# First weight setting.
gohan = GreedyAgentBoost(weight=[100, 2, 2, 1, 0.1])
print(gohan.name)

# Every duel passes the "deterministic" mode first, like the first call below
# and every other run_many_duels call; three calls used to omit it.
goku = RandomAgent(distribution='uniform')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games=n_games, shuffle_agents=True))

goku = RandomAgent(distribution='uniform_on_types')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games=n_games, shuffle_agents=True))

goku = RandomAgent(distribution='first_buy')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games=n_games, shuffle_agents=True))


# Second weight setting.
gohan = GreedyAgentBoost(weight=[100, 2.5, 1.5, 1, 0.1])
print(gohan.name)

goku = RandomAgent(distribution='uniform')
print(fight_pit.run_many_duels("deterministic", [goku, gohan], number_of_games=n_games, shuffle_agents=True))

goku = RandomAgent(distribution='uniform_on_types')