def final_result(self, result: GameResult): """ This method is called once the game is over. If `self.training` is True, we execute a training run for the Neural Network. :param result: The result of the game that just finished. """ # Compute the final reward based on the game outcome if (result == GameResult.YELLOW_WIN and self.side == YELLOW) or ( result == GameResult.RED_WIN and self.side == RED): reward = self.win_value # type: float elif (result == GameResult.YELLOW_WIN and self.side == RED) or ( result == GameResult.RED_WIN and self.side == YELLOW): reward = self.loss_value # type: float elif result == GameResult.DRAW: reward = self.draw_value # type: float else: raise ValueError("Unexpected game result {}".format(result)) # The final reward is also the Q value we want to learn for the action that led to it. self.next_max_log.append(reward) # If we are in training mode we run the optimizer. if self.training: # We calculate our new estimate of what the true Q values are and feed that into the network as # learning target targets = self.calculate_targets() # We convert the input states we have recorded to feature vectors to feed into the training. nn_input = [self.board_state_to_nn_input(x) for x in self.board_position_log] # We run the training step with the recorded inputs and new Q value targets. TFSN.get_session().run([self.nn.train_step], feed_dict={self.nn.input_positions: nn_input, self.nn.target_input: targets})
def get_probs(self, input_pos: np.ndarray) -> ([float], [float]):
    """
    Feeds the feature vector `input_pos`, which encodes a board state, into the Neural Network and
    computes the Q values and corresponding probabilities for all moves (including illegal ones).
    :param input_pos: The feature vector to be fed into the Neural Network.
    :return: A tuple of probabilities and Q values of all actions (including illegal ones).
    """
    probs, qvalues = TFSN.get_session().run([self.nn.probabilities, self.nn.q_values],
                                            feed_dict={self.nn.input_positions: [input_pos]})
    return probs[0], qvalues[0]
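
# A usage sketch for `get_probs`, e.g. inside the player's move selection: convert the current
# board to its feature vector, query the network, and pick the strongest action. Masking of
# illegal moves is omitted here, and `board.state` as the raw board state is an assumption.
def pick_greedy_move(self, board: Board) -> int:
    nn_input = self.board_state_to_nn_input(board.state)
    probs, _ = self.get_probs(nn_input)
    return int(np.argmax(probs))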
def get_probs(self, input_pos: [np.ndarray], network: QNetwork) -> ([float], [float]):
    """
    Feeds the feature vectors `input_pos` (which encode board states) into the Neural Network and
    computes the Q values and corresponding probabilities for all moves (including illegal ones).
    :param network: The network to get probabilities from.
    :param input_pos: A list of feature vectors to be fed into the Neural Network.
    :return: A tuple of two lists: the probabilities and the Q values of all actions
             (including illegal ones) for each input position.
    """
    probs, qvalues = TFSN.get_session().run(
        [network.probabilities, network.q_values],
        feed_dict={network.input_positions: input_pos})
    return probs, qvalues
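
# `get_valid_probs` (used in the replay-buffer training further below) is typically a thin
# wrapper around this batched `get_probs` that masks out illegal moves before an argmax is
# taken. A sketch under the assumption that `Board` exposes an `is_legal(column)` check and
# that moves are the 7 board columns; both names are assumptions, not confirmed here.
def get_valid_probs(self, input_pos, network, boards):
    probs, qvals = self.get_probs(input_pos, network)
    probs = np.copy(probs)

    for i, board in enumerate(boards):
        for move in range(7):
            if not board.is_legal(move):
                probs[i][move] = -1.0  # ensure illegal moves can never win the argmax

    return probs, qvals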
import tensorflow as tf

from c4nn.TFSessionManager import TFSessionManager
# from c4nn.SimpleNNQPlayer import NNQPlayer
from c4nn.RndMinMaxAgent import RndMinMaxAgent
# from c4nn.DirectPolicyAgent import DirectPolicyAgent
from c4nn.DeepExpDoubleDuelQPlayer import DeepExpDoubleDuelQPlayer
# Assumed module paths for the remaining dependencies used below:
from c4nn.RandomPlayer import RandomPlayer
from util import evaluate_players

train = False

dddplayer = DeepExpDoubleDuelQPlayer("DEDDPlayer1", win_value=10.0, loss_value=-10.0, learning_rate=0.001)
rndplayer = RandomPlayer()
rmmplayer = RndMinMaxAgent(3)

TFSessionManager.set_session(tf.Session())

if not train:
    TFSessionManager.load_session('models/SimpleNNQPlayer')

sess = TFSessionManager.get_session()

if train:
    sess.run(tf.global_variables_initializer())

# num battles
nb = 500
# games per battle
gpb = 100

game_number, p1_wins, p2_wins, draws = evaluate_players(dddplayer,
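
# For orientation, a minimal sketch of what one such battle could look like if written
# directly with `play_game` from util (used in the next snippet). This is not the actual
# `evaluate_players` implementation; the RED = first player mapping, the three-argument
# call to play_game, and the Board/GameResult imports are assumptions.
def run_battle(player1, player2, games_per_battle=100):
    p1_wins = p2_wins = draws = 0

    for _ in range(games_per_battle):
        result = play_game(Board(), player1, player2)
        if result == GameResult.RED_WIN:
            p1_wins += 1
        elif result == GameResult.YELLOW_WIN:
            p2_wins += 1
        else:
            draws += 1

    return p1_wins, p2_wins, draws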
import tensorflow as tf

from util import play_game
from c4nn.Board import Board, GameResult
from c4nn.TFSessionManager import TFSessionManager
from c4nn.HumanPlayer import HumanPlayer
from c4nn.SimpleNNQPlayer import NNQPlayer
from c4nn.RndMinMaxAgent import RndMinMaxAgent
from c4nn.DeepExpDoubleDuelQPlayer import DeepExpDoubleDuelQPlayer
from numpy import array

board = Board()

nnplayer = DeepExpDoubleDuelQPlayer("DEDDPlayer1", win_value=10.0, loss_value=-10.0, learning_rate=0.001,
                                    training=False)
rndplayer = RndMinMaxAgent(3)

TFSessionManager.set_session(tf.Session())
TFSessionManager.load_session('models/models_session2')
sess = TFSessionManager.get_session()

res = play_game(board, nnplayer, rndplayer, print_steps=True, reset_board=False, shift=False, slow=True)

if res == GameResult.RED_WIN:
    txt = 'Red wins! (art. neural network)'
elif res == GameResult.YELLOW_WIN:
    txt = 'Yellow wins! (lim. minmax algorithm)'
else:
    txt = 'We got a draw!'

print('\n' + txt + '\n')
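
# `HumanPlayer` is imported above but not used. To play against the trained network yourself,
# the minmax agent can be swapped for a human-controlled player. A sketch: the no-argument
# constructor and reusing the same keyword arguments are assumptions.
human = HumanPlayer()
res = play_game(board, nnplayer, human, print_steps=True, reset_board=True, shift=False, slow=False)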
def final_result(self, result: GameResult):
    """
    This method is called once the game is over. If `self.training` is True, we execute a training run
    for the Neural Network.
    :param result: The result of the game that just finished.
    """
    self.game_counter += 1

    # Compute the final reward based on the game outcome
    if (result == GameResult.YELLOW_WIN and self.side == YELLOW) or (
            result == GameResult.RED_WIN and self.side == RED):
        reward = self.win_value  # type: float
    elif (result == GameResult.YELLOW_WIN and self.side == RED) or (
            result == GameResult.RED_WIN and self.side == YELLOW):
        reward = self.loss_value  # type: float
    elif result == GameResult.DRAW:
        reward = self.draw_value  # type: float
    else:
        raise ValueError("Unexpected game result {}".format(result))

    self.add_game_to_replay_buffer(reward)

    # If we are in training mode and have played enough pre-training games, we run the optimizer.
    if self.training and (self.game_counter > self.pre_training_games):

        # Sample a balanced training batch from the win, loss, and draw replay buffers.
        batch_third = self.batch_size // 3
        train_batch = self.replay_buffer_win.sample(batch_third)
        train_batch.extend(self.replay_buffer_loss.sample(batch_third))
        train_batch.extend(self.replay_buffer_draw.sample(batch_third))
        train_batch = np.array(train_batch)

        # Let's compute the target Q values for all non-terminal moves.
        # We extract the resulting state, run it through the target network and
        # get the maximum Q value (of all valid moves).
        next_states = [s[2] for s in train_batch if s[2] is not None]

        target_qs = []

        if len(next_states) > 0:
            probs, qvals = self.get_valid_probs([self.board_state_to_nn_input(s) for s in next_states],
                                                self.target_net, [Board(s) for s in next_states])

            i = 0
            for t in train_batch:
                if t[2] is not None:
                    max_move = np.argmax(probs[i])
                    max_qval = qvals[i][max_move]
                    target_qs.append(max_qval * self.reward_discount)
                    i += 1
                else:
                    # Terminal moves use the final reward directly as their target.
                    target_qs.append(t[3])

            if i != len(next_states):
                print("Warning: mismatch between the number of next states and computed targets")
        else:
            target_qs.extend(train_batch[:, 3])

        # We convert the input states we have recorded to feature vectors to feed into the training.
        nn_input = [self.board_state_to_nn_input(x[0]) for x in train_batch]
        actions = train_batch[:, 1]

        # We run the training step with the recorded inputs and new Q value targets.
        summary, _ = TFSN.get_session().run(
            [self.q_net.merge, self.q_net.train_step],
            feed_dict={self.q_net.input_positions: nn_input,
                       self.q_net.target_q: target_qs,
                       self.q_net.actions: actions})

        # Gradually reduce the probability of random exploration moves.
        self.random_move_prob *= self.random_move_decrease

        if self.writer is not None:
            self.writer.add_summary(summary, self.game_counter)
            summary = tf.Summary(value=[tf.Summary.Value(tag='Random_Move_Probability',
                                                         simple_value=self.random_move_prob)])
            self.writer.add_summary(summary, self.game_counter)

        # Copy the online Q network weights into the target network.
        TFSN.get_session().run(self.graph_copy_op)
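
# `add_game_to_replay_buffer` is called above but not shown. Its behaviour can be inferred from
# how `train_batch` is indexed: each entry is (state, action, next_state_or_None, reward), and a
# finished game is stored in the win, loss, or draw buffer according to its final reward.
# A sketch; the per-game log attribute names (board_position_log, action_log) and the buffer's
# `add` method are assumptions.
def add_game_to_replay_buffer(self, reward: float):
    game_length = len(self.action_log)

    if reward == self.win_value:
        buffer = self.replay_buffer_win
    elif reward == self.loss_value:
        buffer = self.replay_buffer_loss
    else:
        buffer = self.replay_buffer_draw

    for i in range(game_length - 1):
        # Intermediate moves: the resulting state is the next recorded position, reward 0.
        buffer.add([self.board_position_log[i], self.action_log[i],
                    self.board_position_log[i + 1], 0])

    # The final move has no successor state and carries the game's final reward.
    buffer.add([self.board_position_log[game_length - 1], self.action_log[game_length - 1],
                None, reward])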