Example #1
    def final_result(self, result: GameResult):
        """
        This method is called once the game is over. If `self.training` is True, we execute a training run for
        the Neural Network.
        :param result: The result of the game that just finished.
        """

        # Compute the final reward based on the game outcome
        if (result == GameResult.YELLOW_WIN and self.side == YELLOW) or (
                result == GameResult.RED_WIN and self.side == RED):
            reward = self.win_value  # type: float
        elif (result == GameResult.YELLOW_WIN and self.side == RED) or (
                result == GameResult.RED_WIN and self.side == YELLOW):
            reward = self.loss_value  # type: float
        elif result == GameResult.DRAW:
            reward = self.draw_value  # type: float
        else:
            raise ValueError("Unexpected game result {}".format(result))

        # The final reward is also the Q value we want to learn for the action that led to it.
        self.next_max_log.append(reward)

        # If we are in training mode we run the optimizer.
        if self.training:
            # We calculate our new estimate of what the true Q values are and feed that into the network as
            # learning target
            targets = self.calculate_targets()

            # We convert the input states we have recorded to feature vectors to feed into the training.
            nn_input = [self.board_state_to_nn_input(x) for x in self.board_position_log]

            # We run the training step with the recorded inputs and new Q value targets.
            TFSN.get_session().run([self.nn.train_step],
                                   feed_dict={self.nn.input_positions: nn_input, self.nn.target_input: targets})
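
The `calculate_targets` call above is not shown in these examples. Below is a minimal sketch of what it might look like, not the project's actual implementation: it assumes the player also keeps an `action_log` of chosen moves and a `values_log` of the Q values predicted for each recorded position, that `numpy` is imported as `np`, and that targets follow the standard one-step Q-learning update with discount `self.reward_discount`.

    def calculate_targets(self) -> [np.ndarray]:
        """
        Hypothetical sketch: for every recorded move, replace the predicted Q value of the
        chosen action with the discounted maximum Q value of the following position
        (`next_max_log`), whose final entry is the game's final reward.
        """
        targets = []
        for i in range(len(self.action_log)):
            # Start from the Q values the network predicted for this position.
            target = np.copy(self.values_log[i])
            # Overwrite the value of the action actually taken with the bootstrapped target.
            target[self.action_log[i]] = self.reward_discount * self.next_max_log[i]
            targets.append(target)
        return targets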
Example #2
    def get_probs(self, input_pos: np.ndarray) -> ([float], [float]):
        """
        Feeds the feature vector `input_pos`, which encodes a board state, into the Neural Network and
        computes the Q values and corresponding probabilities for all moves (including illegal ones).
        :param input_pos: The feature vector to be fed into the Neural Network.
        :return: A tuple of the probabilities and the Q values of all actions (including illegal ones).
        """
        probs, qvalues = TFSN.get_session().run([self.nn.probabilities, self.nn.q_values],
                                                feed_dict={self.nn.input_positions: [input_pos]})
        return probs[0], qvalues[0]
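
For illustration, a hedged usage sketch of this method: `player` stands for any of the NNQPlayer-style agents in these examples, the `Board` import follows Example #5, and `board.state` is an assumed attribute holding the raw board array.

import numpy as np

from c4nn.Board import Board

board = Board()
# `board.state` is an assumed attribute; the conversion helper mirrors the other examples.
probs, qvalues = player.get_probs(player.board_state_to_nn_input(board.state))
move = int(np.argmax(probs))  # highest-probability move; illegal moves would still need masking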
Example #3
    def get_probs(self, input_pos: [np.ndarray],
                  network: QNetwork) -> ([float], [float]):
        """
        Feeds the feature vectors `input_pos` (each of which encodes a board state) into the Neural Network
        and computes the Q values and corresponding probabilities for all moves (including illegal ones).
        :param network: The network to get the probabilities from.
        :param input_pos: A list of feature vectors to be fed into the Neural Network.
        :return: A tuple of two lists: the probabilities and the Q values of all actions (including
            illegal ones) for each input position.
        """
        probs, qvalues = TFSN.get_session().run(
            [network.probabilities, network.q_values],
            feed_dict={network.input_positions: input_pos})
        return probs, qvalues
Example #4
import tensorflow as tf

from c4nn.TFSessionManager import TFSessionManager
# from c4nn.SimpleNNQPlayer import NNQPlayer
from c4nn.RndMinMaxAgent import RndMinMaxAgent
# from c4nn.DirectPolicyAgent import DirectPolicyAgent
from c4nn.DeepExpDoubleDuelQPlayer import DeepExpDoubleDuelQPlayer
# The next two module paths are assumptions; adjust them to wherever RandomPlayer
# and evaluate_players live in this project.
from c4nn.RandomPlayer import RandomPlayer
from util import evaluate_players

train = False

dddplayer = DeepExpDoubleDuelQPlayer("DEDDPlayer1",
                                     win_value=10.0,
                                     loss_value=-10.0,
                                     learning_rate=0.001)

rndplayer = RandomPlayer()
rmmplayer = RndMinMaxAgent(3)

TFSessionManager.set_session(tf.Session())

if not train:
    TFSessionManager.load_session('models/SimpleNNQPlayer')

sess = TFSessionManager.get_session()

if train:
    sess.run(tf.global_variables_initializer())

# num battles
nb = 500
# games per battle
gpb = 100

game_number, p1_wins, p2_wins, draws = evaluate_players(
    dddplayer, rmmplayer,
    num_battles=nb, games_per_battle=gpb)  # opponent and keyword argument names assumed
Example #5
import tensorflow as tf

from util import play_game
from c4nn.Board import Board, GameResult
from c4nn.TFSessionManager import TFSessionManager
from c4nn.HumanPlayer import HumanPlayer
from c4nn.SimpleNNQPlayer import NNQPlayer
from c4nn.RndMinMaxAgent import RndMinMaxAgent
from c4nn.DeepExpDoubleDuelQPlayer import DeepExpDoubleDuelQPlayer
from numpy import array

board = Board()

nnplayer = DeepExpDoubleDuelQPlayer("DEDDPlayer1", win_value=10.0, loss_value=-10.0, learning_rate=0.001, training=False)
rndplayer = RndMinMaxAgent(3)
TFSessionManager.set_session(tf.Session())
TFSessionManager.load_session('models/models_session2')

sess = TFSessionManager.get_session()

res = play_game(board, nnplayer, rndplayer, print_steps = True, reset_board=False, shift=False, slow=True)

if res == GameResult.RED_WIN:
    txt = 'Red wins! (artificial neural network)'
elif res == GameResult.YELLOW_WIN:
    txt = 'Yellow wins! (depth-limited minimax algorithm)'
else:
    txt = 'We got a draw!'

print('\n'+txt+'\n')
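
Since `HumanPlayer` is already imported above, the same setup can pit the trained network against a human; a brief hedged variation, assuming `HumanPlayer` needs no constructor arguments and reusing the keyword arguments from the call above.

human = HumanPlayer()  # constructor arguments assumed
res = play_game(Board(), nnplayer, human, print_steps=True, reset_board=False, shift=False, slow=True)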
Example #6
    def final_result(self, result: GameResult):
        """
        This method is called once the game is over. If `self.training` is True, we execute a training run for
        the Neural Network.
        :param result: The result of the game that just finished.
        """

        self.game_counter += 1

        # Compute the final reward based on the game outcome
        if (result == GameResult.YELLOW_WIN and self.side == YELLOW) or (
                result == GameResult.RED_WIN and self.side == RED):
            reward = self.win_value  # type: float
        elif (result == GameResult.YELLOW_WIN and self.side == RED) or (
                result == GameResult.RED_WIN and self.side == YELLOW):
            reward = self.loss_value  # type: float
        elif result == GameResult.DRAW:
            reward = self.draw_value  # type: float
        else:
            raise ValueError("Unexpected game result {}".format(result))

        self.add_game_to_replay_buffer(reward)

        # If we are in training mode we run the optimizer.
        if self.training and (self.game_counter > self.pre_training_games):

            batch_third = self.batch_size // 3
            train_batch = self.replay_buffer_win.sample(batch_third)
            train_batch.extend(self.replay_buffer_loss.sample(batch_third))
            train_batch.extend(self.replay_buffer_draw.sample(batch_third))
            train_batch = np.array(train_batch)
            # Compute the target Q values for all non-terminal moves: extract each
            # resulting state, run it through the target network, and take the
            # maximum Q value over all valid moves.
            next_states = [s[2] for s in train_batch if s[2] is not None]
            target_qs = []

            if len(next_states) > 0:
                probs, qvals = self.get_valid_probs(
                    [self.board_state_to_nn_input(s) for s in next_states],
                    self.target_net, [Board(s) for s in next_states])

                i = 0
                for t in train_batch:
                    if t[2] is not None:
                        max_move = np.argmax(probs[i])
                        max_qval = qvals[i][max_move]
                        target_qs.append(max_qval * self.reward_discount)
                        i += 1
                    else:
                        target_qs.append(t[3])

                if i != len(next_states):
                    print("Something wrong here!!!")
            else:
                target_qs.extend(train_batch[:, 3])

            # We convert the input states we have recorded to feature vectors to feed into the training.
            nn_input = [
                self.board_state_to_nn_input(x[0]) for x in train_batch
            ]
            actions = train_batch[:, 1]

            # We run the training step with the recorded inputs and new Q value targets.
            summary, _ = TFSN.get_session().run(
                [self.q_net.merge, self.q_net.train_step],
                feed_dict={
                    self.q_net.input_positions: nn_input,
                    self.q_net.target_q: target_qs,
                    self.q_net.actions: actions
                })
            self.random_move_prob *= self.random_move_decrease

            if self.writer is not None:
                self.writer.add_summary(summary, self.game_counter)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='Random_Move_Probability',
                                     simple_value=self.random_move_prob)
                ])
                self.writer.add_summary(summary, self.game_counter)

            TFSN.get_session().run(self.graph_copy_op)
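
The `add_game_to_replay_buffer` call near the top of this example is also not shown. Below is a minimal sketch of the experience layout it would have to produce, not the project's actual implementation: it assumes the buffers expose an `add` method and that the player keeps `board_position_log` and `action_log` as in the other examples. The training code above indexes each stored experience as (state, action, next_state, reward).

    def add_game_to_replay_buffer(self, reward: float):
        """
        Hypothetical sketch: store every move of the finished game as an experience tuple
        (state, action, next_state, reward). The terminal move gets next_state = None and
        carries the final reward; intermediate moves carry a placeholder reward of 0.
        """
        game_length = len(self.action_log)

        # Route the whole game into the buffer matching its outcome, so the training
        # batch above can sample wins, losses, and draws in equal proportions.
        if reward == self.win_value:
            buffer = self.replay_buffer_win
        elif reward == self.loss_value:
            buffer = self.replay_buffer_loss
        else:
            buffer = self.replay_buffer_draw

        for i in range(game_length - 1):
            buffer.add([self.board_position_log[i], self.action_log[i],
                        self.board_position_log[i + 1], 0.0])

        buffer.add([self.board_position_log[game_length - 1],
                    self.action_log[game_length - 1], None, reward])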