def load_network_player(network_filename, hidden_layers):
    """Create a player function that picks moves using a trained network."""
    session = tf.Session()
    input_layer, output_layer, variables = network_helpers.create_network(
        game_spec.board_squares(), hidden_layers)
    network_helpers.load_network(session, variables, network_filename)

    def network_player(board_state, side):
        print()
        print("Network player (%s)" % side)
        tic_tac_toe.print_game_state(board_state)
        move_probs = network_helpers.get_stochastic_network_move(
            session, input_layer, output_layer, board_state, side, log=True)
        move = game_spec.flat_move_to_tuple(move_probs.argmax())
        print("Network move:", move)
        return move

    return network_player
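
# Usage sketch: build a network-backed player and play it against a random
# opponent. This assumes the module-level `game_spec` used above; the file
# name and hidden layer sizes below are illustrative, not from this repo.
def _example_network_player_game():
    network_player = load_network_player('current_network.p',
                                         hidden_layers=(100, 100, 100))
    result = game_spec.play_game(network_player,
                                 game_spec.get_random_player_func())
    print("result for the network player:", result)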
    def test_load_variables_into_network_of_wrong_size_gives_friendly_exception(self):
        try:
            file_name = 'test.p'
            input_nodes = 20
            _, _, variables1 = create_network(input_nodes, (30, ))
            _, _, variables2 = create_network(input_nodes, (40, ))

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                save_network(session, variables1, file_name)

                with self.assertRaises(ValueError):
                    load_network(session, variables2, file_name)
        finally:
            try:
                os.remove(file_name)
            except OSError:
                pass
    def test_save_and_load_network(self):
        try:
            file_name = 'test.p'
            input_nodes = 20
            hidden_nodes = (50, 40, 30)
            _, _, variables1 = create_network(input_nodes, hidden_nodes)
            _, _, variables2 = create_network(input_nodes, hidden_nodes)

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                save_network(session, variables1, file_name)
                load_network(session, variables2, file_name)

                for var1, var2 in zip(variables1, variables2):
                    np.testing.assert_array_almost_equal(
                        session.run(var1), session.run(var2))
        finally:
            try:
                os.remove(file_name)
            except OSError:
                pass
def predict_best_move_low_level(game_spec, create_network, network_file_path,
                                player, board_state):
    """Make a prediction for the next move at a given state using lower level parameters.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        player: The player to make the move, 1 or -1
        board_state: The state of the board at some point during the game

    Returns:
        A vector of zeros with a 1 at the position of the best move to take.
    """
    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))
    input_layer, output_layer, variables = create_network()

    # training op kept for parity with the training code; not used for prediction
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      reduction_indices=1)) * reward_placeholder

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("Loading trained network from ", network_file_path)
            load_network(session, variables, network_file_path)
        else:
            print("File with trained network can't be loaded. Exiting...")
            return

        return get_stochastic_network_move(session, input_layer, output_layer,
                                           board_state, player)
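
# Usage sketch: query the trained policy for a single position. The game spec
# class name and the empty-board helper are assumptions for illustration;
# `create_network` is curried with functools.partial as done elsewhere here.
def _example_predict_best_move():
    game_spec = tic_tac_toe.TicTacToeGameSpec()  # assumed game spec class
    create_net = functools.partial(create_network, game_spec.board_squares(),
                                   (100, 100, 100))
    board_state = game_spec.new_board()  # assumed empty-board helper
    move_vector = predict_best_move_low_level(game_spec, create_net,
                                              'current_network.p', 1,
                                              board_state)
    print("chosen move index:", np.argmax(move_vector))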
def train_supervised(game_spec,
                     create_network,
                     network_file_path,
                     positions,
                     test_set_ratio=0.4,
                     regularization_coefficent=1e-5,
                     batch_size=100,
                     learn_rate=1e-4,
                     stop_turns_without_improvement=7):
    """Train a network with supervised learning against a list of game positions and the moves chosen.

    We stop after stop_turns_without_improvement epochs without an improvement in the test error.
    The test set currently doubles as a validation set; this may be improved in the future by
    keeping separate test and validation sets.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        positions ([(board_state, move)]): list of tuples of board states and the moves chosen in them
        test_set_ratio (float): portion of the data to put in the test set
        regularization_coefficent (float): amount to multiply the l2 regularizer by in the loss function
        batch_size (int):
        learn_rate (float):
        stop_turns_without_improvement (int): we stop training after this many epochs without
            any improvement in the test error

    Returns:
        episode_number, train_error, train_accuracy, new_test_error, test_accuracy
    """
    input_layer, output_layer, variables = create_network()

    test_set_count = int(len(positions) * test_set_ratio)
    train_set = positions[:-test_set_count]
    test_set = positions[-test_set_count:]

    actual_move_placeholder = tf.compat.v1.placeholder(
        "float", (None, game_spec.outputs()))

    error = tf.reduce_sum(input_tensor=tf.square(actual_move_placeholder -
                                                 output_layer))

    regularizer = None
    for var in variables:
        if regularizer is None:
            regularizer = tf.nn.l2_loss(var)
        else:
            regularizer += tf.nn.l2_loss(var)

    loss = error + regularizer * regularization_coefficent

    train_step = tf.compat.v1.train.RMSPropOptimizer(learn_rate).minimize(loss)

    correct_pred = tf.equal(tf.argmax(input=output_layer, axis=1),
                            tf.argmax(input=actual_move_placeholder, axis=1))
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_pred, tf.float32))

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        if os.path.isfile(network_file_path):
            print("loading existing network")
            load_network(session, variables, network_file_path)

        episode_number = 1
        turns_without_test_improvement = 0

        best_test_error, test_accuracy = session.run(
            [error, accuracy],
            feed_dict={
                input_layer: [x[0] for x in test_set],
                actual_move_placeholder: [x[1] for x in test_set]
            })

        while True:
            random.shuffle(train_set)
            train_error = 0

            for start_index in range(0,
                                     len(train_set) - batch_size + 1,
                                     batch_size):
                mini_batch = train_set[start_index:start_index + batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        input_layer: [x[0] for x in mini_batch],
                        actual_move_placeholder: [x[1] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error, test_accuracy = session.run(
                [error, accuracy],
                feed_dict={
                    input_layer: [x[0] for x in test_set],
                    actual_move_placeholder: [x[1] for x in test_set]
                })

            print("episode: %s train_error: %s test_error: %s test_acc: %s" %
                  (episode_number, train_error, new_test_error, test_accuracy))

            if new_test_error < best_test_error:
                best_test_error = new_test_error
                turns_without_test_improvement = 0
            else:
                turns_without_test_improvement += 1
                if turns_without_test_improvement > stop_turns_without_improvement:
                    train_accuracy = session.run(
                        accuracy,
                        feed_dict={
                            input_layer: [x[0] for x in train_set],
                            actual_move_placeholder: [x[1] for x in train_set]
                        })
                    print("test error not improving for %s turns, ending training" %
                          (stop_turns_without_improvement, ))
                    break

            episode_number += 1

        print("final episode: %s train_error: %s train acc: %s test_error: %s test_acc: %s" %
              (episode_number, train_error, train_accuracy, new_test_error,
               test_accuracy))

        save_network(session, variables, network_file_path)

    return episode_number, train_error, train_accuracy, new_test_error, test_accuracy
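
# Usage sketch: train from recorded (board_state, one-hot move) pairs. How the
# `positions` list is produced is outside this function; everything named here
# besides train_supervised itself is illustrative.
def _example_train_supervised(positions):
    game_spec = tic_tac_toe.TicTacToeGameSpec()  # assumed game spec class
    create_net = functools.partial(create_network, game_spec.board_squares(),
                                   (100, 100, 100))
    return train_supervised(game_spec, create_net, 'supervised_network.p',
                            positions, batch_size=100, learn_rate=1e-4)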
def train_policy_gradients_vs_historic(game_spec,
                                       create_network,
                                       network_file_path,
                                       save_network_file_path=None,
                                       number_of_historic_networks=8,
                                       save_historic_every=10000,
                                       historic_network_base_path='historic_network',
                                       number_of_games=100000,
                                       print_results_every=1000,
                                       learn_rate=1e-4,
                                       batch_size=100):
    """Train a network against itself, storing new versions of itself to play against over time.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        save_network_file_path (str): Optionally specify a path to use for saving the network;
            if unset the network_file_path param is used.
        number_of_historic_networks (int): We keep this many old networks to play against
        save_historic_every (int): We save a version of the learning network into one of the
            historic network "slots" every x games. We have number_of_historic_networks "slots".
        historic_network_base_path (str): Base path to save new historic networks to; a number
            for the network "slot" is appended to the end of this string.
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):

    Returns:
        [tf.Variable]: trained variables used in the final network
    """
    input_layer, output_layer, variables = create_network()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder(
        "float", shape=(None, game_spec.board_squares()))

    policy_gradient = tf.reduce_sum(
        tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder *
        output_layer)
    train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(
        -policy_gradient)

    current_historical_index = 0
    historical_networks = []

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=print_results_every)

    for _ in range(number_of_historic_networks):
        historical_input_layer, historical_output_layer, historical_variables = create_network()
        historical_networks.append(
            (historical_input_layer, historical_output_layer,
             historical_variables))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        def make_move_historical(historical_network_index, board_state, side):
            net = historical_networks[historical_network_index]
            move = get_stochastic_network_move(session,
                                               net[0],
                                               net[1],
                                               board_state,
                                               side,
                                               valid_only=True,
                                               game_spec=game_spec)
            return game_spec.flat_move_to_tuple(move.argmax())

        def make_training_move(board_state, side):
            mini_batch_board_states.append(np.ravel(board_state) * side)
            move = get_stochastic_network_move(session,
                                               input_layer,
                                               output_layer,
                                               board_state,
                                               side,
                                               valid_only=True,
                                               game_spec=game_spec)
            mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move.argmax())

        if os.path.isfile(network_file_path):
            print("loading pre-existing weights")
            load_network(session, variables, network_file_path)
        else:
            print("could not find previous weights so initialising randomly")

        for i in range(number_of_historic_networks):
            if os.path.isfile(historic_network_base_path + str(i) + '.p'):
                load_network(session, historical_networks[i][2],
                             historic_network_base_path + str(i) + '.p')
            elif os.path.isfile(network_file_path):
                # if we can't load a historical file use the current network weights
                load_network(session, historical_networks[i][2],
                             network_file_path)

        for episode_number in range(1, number_of_games):
            opponent_index = random.randint(0, number_of_historic_networks - 1)
            make_move_historical_for_index = functools.partial(
                make_move_historical, opponent_index)

            # randomize if going first or second
            if bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move,
                                             make_move_historical_for_index)
            else:
                reward = -game_spec.play_game(make_move_historical_for_index,
                                              make_training_move)

            results.append(reward)

            # we scale here so winning quickly is better than winning slowly,
            # and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)
            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)
            episode_number += 1

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                print("episode: %s average result: %s" %
                      (episode_number, np.mean(results)))

            if episode_number % save_historic_every == 0:
                print("saving historical network %s" %
                      current_historical_index)
                save_network(
                    session, variables,
                    historic_network_base_path +
                    str(current_historical_index) + '.p')
                load_network(
                    session, historical_networks[current_historical_index][2],
                    historic_network_base_path +
                    str(current_historical_index) + '.p')

                # also save to the main network file
                save_network(session, variables,
                             save_network_file_path or network_file_path)

                current_historical_index += 1
                current_historical_index %= number_of_historic_networks

        # save our final weights
        save_network(session, variables,
                     save_network_file_path or network_file_path)

    return variables
def train_value_network(game_spec,
                        hidden_nodes_reinforcement,
                        reinforcement_network_file_path,
                        hidden_nodes_value,
                        value_network_file_path,
                        learn_rate=1e-4,
                        batch_size=100,
                        train_samples=10000,
                        test_samples=8000):
    """Train a value network to predict game results from positions, using a trained policy network."""
    reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
        game_spec.board_squares(), hidden_nodes_reinforcement,
        game_spec.outputs())

    value_input_layer, value_output_layer, value_variables = create_network(
        game_spec.board_squares(),
        hidden_nodes_value,
        output_nodes=1,
        output_softmax=False)

    target_placeholder = tf.compat.v1.placeholder("float", (None, 1))
    error = tf.reduce_sum(input_tensor=tf.square(target_placeholder -
                                                 value_output_layer))

    train_step = tf.compat.v1.train.RMSPropOptimizer(learn_rate).minimize(
        error)

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        load_network(session, reinforcement_variables,
                     reinforcement_network_file_path)

        if os.path.isfile(value_network_file_path):
            print("loading previous version of value network")
            load_network(session, value_variables, value_network_file_path)

        def make_move(board_state, side):
            move = get_deterministic_network_move(session,
                                                  reinforcement_input_layer,
                                                  reinforcement_output_layer,
                                                  board_state, side)
            return game_spec.flat_move_to_tuple(np.argmax(move))

        board_states_training = {}
        board_states_test = []
        episode_number = 0

        while len(board_states_training) < train_samples + test_samples:
            board_state = _generate_random_board_position(
                game_spec, (1, int(game_spec.board_squares() * 0.8)))
            board_state_flat = tuple(np.ravel(board_state))

            # only accept the board_state if not already in the dict
            if board_state_flat not in board_states_training:
                result = game_spec.play_game(make_move,
                                             make_move,
                                             board_state=board_state)
                board_states_training[board_state_flat] = float(result)

        # take a random selection from training into a test set
        for _ in range(test_samples):
            sample = random.choice(list(board_states_training))
            board_states_test.append((sample, board_states_training[sample]))
            del board_states_training[sample]

        board_states_training = list(board_states_training.items())

        test_error = session.run(error,
                                 feed_dict={
                                     value_input_layer:
                                     [x[0] for x in board_states_test],
                                     target_placeholder:
                                     [[x[1]] for x in board_states_test]
                                 })

        while True:
            np.random.shuffle(board_states_training)
            train_error = 0

            for start_index in range(0,
                                     len(board_states_training) - batch_size + 1,
                                     batch_size):
                mini_batch = board_states_training[start_index:start_index +
                                                   batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        value_input_layer: [x[0] for x in mini_batch],
                        target_placeholder: [[x[1]] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error = session.run(error,
                                         feed_dict={
                                             value_input_layer:
                                             [x[0] for x in board_states_test],
                                             target_placeholder:
                                             [[x[1]] for x in board_states_test]
                                         })

            print("episode: %s train_error: %s test_error: %s" %
                  (episode_number, train_error, test_error))

            if new_test_error > test_error:
                print("train error went up, stopping training")
                break

            test_error = new_test_error
            episode_number += 1

        save_network(session, value_variables, value_network_file_path)
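
# Usage sketch: distil a trained policy network into a value network. File
# names and hidden layer sizes are illustrative only.
def _example_train_value_network():
    game_spec = tic_tac_toe.TicTacToeGameSpec()  # assumed game spec class
    train_value_network(game_spec,
                        hidden_nodes_reinforcement=(100, 100, 100),
                        reinforcement_network_file_path='current_network.p',
                        hidden_nodes_value=(100, 100, 100),
                        value_network_file_path='value_network.p')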
def train_policy_gradients_vs_historic(game_spec,
                                       create_network,
                                       load_network_file_path,
                                       save_network_file_path=None,
                                       number_of_historic_networks=1,
                                       historic_network_base_path='historic_network',
                                       number_of_games=10000,
                                       update_opponent_winrate=0.65,
                                       print_results_every=100,
                                       learn_rate=1e-3,
                                       batch_size=100,
                                       cnn_on=False,
                                       eps=0.1,
                                       deterministic=True,
                                       mcts=False,
                                       min_win_ticks=3,
                                       beta=0.01):
    """Train a network against itself, storing new versions of itself to play against over time.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        load_network_file_path (str): path to the file with weights we want to load for this network
        save_network_file_path (str): Optionally specify a path to use for saving the network;
            if unset the load_network_file_path param is used.
        number_of_historic_networks (int): We keep this many old networks to play against
        historic_network_base_path (str): Base path to save new historic networks to; a number
            for the network "slot" is appended to the end of this string.
        number_of_games (int): number of games to play before stopping
        update_opponent_winrate (float): the required winrate before updating the opponent to the newer agent
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        cnn_on (bool): use a convolutional rather than a regular neural network
        eps (float): fraction of moves made randomly
        deterministic (bool): use deterministic rather than stochastic move selection
        mcts (bool): use monte carlo tree search for move selection
        min_win_ticks (int): number of times the network's winrate needs to exceed
            update_opponent_winrate before the opponent is updated
        beta (float): amount to multiply the l2 regularizer by in the loss function

    Returns:
        [tf.Variable]: trained variables used in the final network
    """
    save_network_file_path = save_network_file_path or load_network_file_path

    # create folder if it does not exist
    if save_network_file_path:
        split = save_network_file_path.split('/')
        directory = '/'.join(split[:-1]) or '.'
        if not os.path.isdir(directory):
            os.makedirs(directory)
            print("created directory " + directory)

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables, weights = create_network()

    baseline = np.zeros([100, 1])
    baselineCounter = 0

    # note: np.mean(baseline) is a plain float, fixed at graph construction
    # time; it is not recomputed as the baseline buffer fills up
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      axis=1)) * (reward_placeholder - np.mean(baseline))

    # policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)  # original one from historic
    # train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient)  # why is this one different from the other train policy grad?
    regularizer = sum([tf.nn.l2_loss(i) for i in weights])
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(
        -policy_gradient + beta * regularizer)

    # we will (probably) not use this: we always train against the most recent agent
    current_historical_index = 0
    historical_networks = []

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=print_results_every)

    for _ in range(number_of_historic_networks):
        historical_input_layer, historical_output_layer, historical_variables, _ = create_network()
        historical_networks.append(
            (historical_input_layer, historical_output_layer,
             historical_variables))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        # testvar = variables[0]  # test for checking variables
        # print(session.run(variables[0]))
        base_episode_number = 0
        winrates = []

        if load_network_file_path and os.path.isfile(load_network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, load_network_file_path)
            base_episode_number, winrates = load_results(
                load_network_file_path)
        else:
            print('Creating new network')

        def make_move_historical(historical_network_index, board_state, side):
            net = historical_networks[historical_network_index]
            # move = get_stochastic_network_move(session, net[0], net[1], board_state, side,
            #                                    valid_only=True, game_spec=game_spec, CNN_ON=cnn_on)
            if mcts:
                _, move = monte_carlo_tree_search(game_spec, board_state, side,
                                                  27, session, input_layer,
                                                  output_layer, True, cnn_on,
                                                  True)
            else:
                move = get_deterministic_network_move(session,
                                                      net[0],
                                                      net[1],
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)

            # move must be an array, mcts doesn't return one
            move_for_game = np.asarray(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        def make_training_move(board_state, side):
            if cnn_on:
                np_board_state = create_3x3_board_states(board_state)
            else:
                np_board_state = np.array(board_state)

            mini_batch_board_states.append(np_board_state * side)

            rand_numb = random.uniform(0., 1.)
            if rand_numb < eps:
                move = get_random_network_move(board_state, game_spec)
            elif deterministic:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)
            else:
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    move = get_stochastic_network_move(session,
                                                       input_layer,
                                                       output_layer,
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)

            # the move returned to the game is in a different configuration
            # than the move the CNN learns from
            move_for_game = np.asarray(move)

            if cnn_on:
                # Since the mini batch states are saved the same way they enter
                # the neural net (the adapted board state), the same should
                # happen for the mini batch moves
                move = create_3x3_board_states(np.reshape(
                    move, [9, 9]))  # the function requires a 9x9 array
                mini_batch_moves.append(move[0:81])
            else:
                mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        # for i in range(number_of_historic_networks):
        if os.path.isfile(historic_network_base_path + str(0) + '.p'):
            load_network(session, historical_networks[0][2],
                         historic_network_base_path + str(0) + '.p')
            print('Historic network loaded')
        else:
            # if we can't load a historical file use the current network weights
            print('Warning: loading historical file failed. Current net is '
                  'saved and being used as historic net.')
            historic_filename = historic_network_base_path + str(
                current_historical_index) + '.p'
            save_network(session, variables, historic_filename)
            load_network(session,
                         historical_networks[current_historical_index][2],
                         historic_filename)

        # registers the number of times the agent's winrate has been high
        # enough to update its opponent
        win_ticks = 0

        for episode_number in range(1, number_of_games):
            opponent_index = random.randint(0, number_of_historic_networks - 1)
            make_move_historical_for_index = functools.partial(
                make_move_historical, opponent_index)

            # randomize if going first or second
            if bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move,
                                             make_move_historical_for_index)
            else:
                reward = -game_spec.play_game(make_move_historical_for_index,
                                              make_training_move)

            results.append(reward)
            baseline[baselineCounter] = reward
            baselineCounter += 1
            baselineCounter = baselineCounter % 100

            # we scale here so winning quickly is better than winning slowly,
            # and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)
            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)
            episode_number += 1

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                winrate = _win_rate(print_results_every, results)
                if winrate == 0:
                    print('DEBUG TEST')
                winrates.append(
                    [base_episode_number + episode_number, winrate])
                print("episode: %s win_rate: %s" %
                      (base_episode_number + episode_number, winrate))
                if save_network_file_path:
                    save_network(
                        session, variables,
                        time.strftime(save_network_file_path[:-2] + "_ep" +
                                      str(base_episode_number +
                                          episode_number) +
                                      "_%Y-%m-%d_%H%M%S.p"))

            # update the opponent when the winrate is high enough for a long enough period
            if (episode_number % print_results_every == 0) and (
                    winrate >= update_opponent_winrate):
                win_ticks += 1
                if win_ticks >= min_win_ticks:
                    win_ticks = 0
                    first_bot = False
                    print("saving historical network %s at episode %s." %
                          (current_historical_index,
                           base_episode_number + episode_number))

                    # overwrite the historic opponent with the current network
                    historic_filename = historic_network_base_path + str(
                        current_historical_index) + '.p'
                    save_network(session, variables, historic_filename)
                    load_network(
                        session,
                        historical_networks[current_historical_index][2],
                        historic_filename)

                    # also save to the main network file
                    save_network(session, variables,
                                 (save_network_file_path or
                                  load_network_file_path)[:-2] + "_ep" +
                                 str(base_episode_number + episode_number) +
                                 ".p")

                    # not used when we only have 1 historic network
                    current_historical_index += 1
                    current_historical_index %= number_of_historic_networks

        # save our final weights
        save_network(session, variables,
                     save_network_file_path or load_network_file_path)

    return variables, _win_rate(print_results_every, results), winrates
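
# Usage sketch: self-play training against a frozen historic copy, here with
# the CNN board encoding for ultimate tic-tac-toe. The cnn.create_network
# arguments mirror the test configuration further down; the path and
# hyperparameters are illustrative only.
def _example_train_vs_historic():
    game_spec = ut.UltimateTicTacToeGameSpec()
    create_net = functools.partial(cnn.create_network, [3, 3], [10, 10, 10],
                                   [])
    return train_policy_gradients_vs_historic(game_spec,
                                              create_net,
                                              'networks/net.p',
                                              number_of_games=10000,
                                              update_opponent_winrate=0.65,
                                              cnn_on=True,
                                              eps=0.1,
                                              deterministic=True)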
def benchmark(game_spec,
              network_file_path,
              create_network_func,
              log_games=False,
              games_vs_random=500):
    """Plays games against a variety of algorithms to see how good a network is.

    Results are currently just printed to std out.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        network_file_path (str): path to the file with weights we want to load for this network
        create_network_func (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        log_games (bool): If True print all positions from all games played
        games_vs_random (int): Number of games to play vs random opponents
    """
    input_layer, output_layer, variables = create_network_func()

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        load_network(session, variables, network_file_path)

        def make_move(board_state, side):
            move = get_deterministic_network_move(session,
                                                  input_layer,
                                                  output_layer,
                                                  board_state,
                                                  side,
                                                  valid_only=True,
                                                  game_spec=game_spec)
            return game_spec.flat_move_to_tuple(move.argmax())

        def min_max_move_func(board_state, side, depth):
            return min_max_alpha_beta(game_spec, board_state, side, depth)[1]

        def monte_carlo_move_func(board_state, side):
            return monte_carlo_tree_search_uct(game_spec, board_state, side,
                                               100000)[1]

        results = []
        for _ in range(int(games_vs_random / 2)):
            result = game_spec.play_game(make_move,
                                         game_spec.get_random_player_func(),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(game_spec.get_random_player_func(),
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs random = %s" % (sum(results), ))

        # play one game as each side against min max at increasing depths
        for depth in (2, 4, 6, 8):
            results = []
            result = game_spec.play_game(make_move,
                                         functools.partial(min_max_move_func,
                                                           depth=depth),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(functools.partial(min_max_move_func,
                                                           depth=depth),
                                         make_move,
                                         log=log_games)
            results.append(-result)

            print("*** results vs min max depth %s = %s" %
                  (depth, sum(results)))

        results = []
        result = game_spec.play_game(make_move,
                                     monte_carlo_move_func,
                                     log=log_games)
        results.append(result)
        result = game_spec.play_game(monte_carlo_move_func,
                                     make_move,
                                     log=log_games)
        results.append(-result)

        print("*** results vs monte carlo uct 100000 = %s" % (sum(results), ))
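
# Usage sketch: benchmark a saved tic-tac-toe network against the built-in
# opponents. The game spec class name and file name are illustrative.
def _example_benchmark():
    game_spec = tic_tac_toe.TicTacToeGameSpec()  # assumed game spec class
    create_net = functools.partial(create_network, game_spec.board_squares(),
                                   (100, 100, 100))
    benchmark(game_spec, 'current_network.p', create_net, log_games=False,
              games_vs_random=500)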
value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_VALUE,
    output_nodes=1,
    output_softmax=False)

target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))

train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)

    if os.path.isfile(VALUE_NETWORK_PATH):
        print("loading previous version of value network")
        load_network(session, value_variables, VALUE_NETWORK_PATH)

    def make_move(board_state, side):
        move = get_deterministic_network_move(session,
                                              reinforcement_input_layer,
                                              reinforcement_output_layer,
                                              board_state, side)
        return game_spec.flat_move_to_tuple(np.argmax(move))

    board_states_training = {}
    board_states_test = []
def train_policy_gradients(game_spec,
                           create_network,
                           load_network_file_path,
                           save_network_file_path=None,
                           opponent_func=None,
                           number_of_games=10000,
                           print_results_every=1000,
                           learn_rate=1e-4,
                           batch_size=100,
                           randomize_first_player=True,
                           cnn_on=False,
                           eps=0.1,
                           deterministic=True,
                           mcts=False,
                           beta=0.01):
    """Train a network using policy gradients.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        load_network_file_path (str): path to the file with weights we want to load for this network
        save_network_file_path (str): Optionally specify a path to use for saving the network;
            if unset the load_network_file_path param is used.
        opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an
            opponent playing randomly
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        randomize_first_player (bool): If True we alternate between being the first and second player
        cnn_on (bool): if True, the convolutional neural network is used
        eps (float): fraction of moves made randomly
        deterministic (bool): use deterministic rather than stochastic move selection
        mcts (bool): use monte carlo tree search for move selection
        beta (float): amount to multiply the l2 regularizer by in the loss function

    Returns:
        (variables used in the final network : list, win rate: float)
    """
    save_network_file_path = save_network_file_path or load_network_file_path

    # create folder if it does not exist
    if save_network_file_path:
        split = save_network_file_path.split('/')
        directory = '/'.join(split[:-1]) or '.'
        if not os.path.isdir(directory):
            os.makedirs(directory)
            print("created directory " + directory)

    if mcts:
        opponent_func = game_spec.get_monte_carlo_player_func(
            number_of_samples=27)
    else:
        opponent_func = opponent_func or game_spec.get_random_player_func()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables, weights = create_network()

    baseline = np.zeros([100, 1])
    baselineCounter = 0

    # note: np.mean(baseline) is a plain float, fixed at graph construction time
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      axis=1)) * (reward_placeholder - np.mean(baseline))

    # regularizer = sum([tf.nn.l2_loss(i) for i in weights])
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(
        -policy_gradient)  # + beta * regularizer)
    # train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # load existing network and keep track of the number of games played
        base_episode_number = 0
        winrates = []

        if load_network_file_path and os.path.isfile(load_network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, load_network_file_path)
            base_episode_number, winrates = load_results(
                load_network_file_path)

        mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
        results = collections.deque(maxlen=print_results_every)

        def make_training_move(board_state, side):
            if cnn_on:
                # We must have the first 3x3 board as the first 9 entries of
                # the list, the second 3x3 board as the next 9 entries, etc.
                # This is required for the CNN, which takes the first 9
                # entries and forms a 3x3 board, and so on. If the 10 split
                # 3x3 boards are desired, use create_3x3_board_states here.
                np_board_state = create_3x3_board_states(board_state)
            else:
                np_board_state = np.array(board_state)
                np_board_state[np_board_state > 1] = 0

            # append all states used in the minibatch (+ and - determine
            # which player's state it was)
            mini_batch_board_states.append(np_board_state * side)

            rand_numb = random.uniform(0., 1.)
            if rand_numb < eps:
                move = get_random_network_move(board_state, game_spec)
            elif deterministic:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)
            else:
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    move = get_stochastic_network_move(session,
                                                       input_layer,
                                                       output_layer,
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)

            # the move returned to the game is in a different configuration
            # than the move the CNN learns from
            move_for_game = np.asarray(move)

            if cnn_on:
                # Since the mini batch states are saved the same way they enter
                # the neural net (the adapted board state), the same should
                # happen for the mini batch moves
                move = create_3x3_board_states(np.reshape(
                    move, [9, 9]))  # the function requires a 9x9 array
                mini_batch_moves.append(move[0:81])
            else:
                mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        for episode_number in range(1, number_of_games + 1):
            # randomize if going first or second
            if (not randomize_first_player) or bool(random.getrandbits(1)):
                # one game is played in this line
                reward = game_spec.play_game(make_training_move, opponent_func)
            else:
                reward = -game_spec.play_game(opponent_func,
                                              make_training_move)

            results.append(reward)
            baseline[baselineCounter] = reward
            baselineCounter += 1
            baselineCounter = baselineCounter % 100

            # we scale here so winning quickly is better than winning slowly,
            # and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)
            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                winrate = _win_rate(print_results_every, results)
                winrates.append(
                    [base_episode_number + episode_number, winrate])
                print("episode: %s win_rate: %s" %
                      (base_episode_number + episode_number, winrate))
                if save_network_file_path:
                    save_network(
                        session, variables,
                        time.strftime(save_network_file_path[:-2] + "_ep" +
                                      str(base_episode_number +
                                          episode_number) +
                                      "_%Y-%m-%d_%H%M%S.p"))

        if save_network_file_path:
            save_network(session, variables, save_network_file_path)

    return variables, _win_rate(print_results_every, results), winrates
def train_policy_gradients(game_spec,
                           create_network,
                           network_file_path,
                           save_network_file_path=None,
                           opponent_func=None,
                           number_of_games=10000,
                           print_results_every=1000,
                           learn_rate=1e-4,
                           batch_size=100,
                           randomize_first_player=True):
    """Train a network using policy gradients.

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        save_network_file_path (str): Optionally specify a path to use for saving the network;
            if unset the network_file_path param is used.
        opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an
            opponent playing randomly
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        randomize_first_player (bool): If True we alternate between being the first and second player

    Returns:
        (variables used in the final network : list, win rate: float)
    """
    save_network_file_path = save_network_file_path or network_file_path
    opponent_func = opponent_func or game_spec.get_random_player_func()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables = create_network()

    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      reduction_indices=1)) * reward_placeholder
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
        results = collections.deque(maxlen=print_results_every)

        def make_training_move(board_state, side):
            mini_batch_board_states.append(np.ravel(board_state) * side)
            move = get_stochastic_network_move(session, input_layer,
                                               output_layer, board_state,
                                               side)
            mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move.argmax())

        for episode_number in range(1, number_of_games):
            # randomize if going first or second
            if (not randomize_first_player) or bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move, opponent_func)
            else:
                reward = -game_spec.play_game(opponent_func,
                                              make_training_move)

            results.append(reward)

            # we scale here so winning quickly is better than winning slowly,
            # and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)
            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                print("episode: %s win_rate: %s" %
                      (episode_number,
                       _win_rate(print_results_every, results)))
                if network_file_path:
                    save_network(session, variables, save_network_file_path)

        if network_file_path:
            save_network(session, variables, save_network_file_path)

    return variables, _win_rate(print_results_every, results)
    def test_game_network(self):
        network_file_path = r'C:\Users\User\APH\1B 2017 2018\Advanced Machine Learning\Resit\Git\QLUT\networks\cnn_50_50_50_e-3_stoch_mcts\\'
        n = 100
        steps_games = 10
        cnn_on = True
        mcts = True
        filter_shape = [3, 3]
        filter_depth = [10, 10, 10]
        dense_width = []
        input_layer = 90
        hidden_layers = [51, 51, 51]
        output_layer = 81

        # collect the file names in the network directory (top level only)
        f = []
        for (dirpath, dirnames, filenames) in os.walk(network_file_path):
            f.extend(filenames)
            break

        netlist_hist = []
        raw_netlist = []
        for file in f:
            p = re.compile(r'net_ep\d+_.+\.p')
            if 'config' in file:
                pass
            elif 'hist' in file:
                historic_net = file
            elif p.search(file) is None:
                netlist_hist.append(file)
            else:
                raw_netlist.append(file)

        # sort the network files by the number of games they were trained for
        nr_games = []
        for i, name in enumerate(raw_netlist):
            nr_games.append((int(name[6:-20]), i))
        nr_games.sort()
        netlist = [raw_netlist[i[1]] for i in nr_games]

        gamefiles = [
            netlist[(i + 1) * steps_games - 1]
            for i in range(0, int(len(netlist) / steps_games))
        ]
        network_games = [
            nr_games[(i + 1) * steps_games - 1][0]
            for i in range(0, int(len(netlist) / steps_games))
        ]
        print(gamefiles)
        print(gamefiles[0][1])

        # if a result file exists:
        # network_info = load_results(network_file_path, results_only=False)
        # input_layer = network_info['input_layer']
        # hidden_layers = network_info['hidden_layers']
        # output_layer = network_info['output_layer']
        # otherwise look up the values

        game_spec = ut.UltimateTicTacToeGameSpec()

        if mcts:
            opponent_func = game_spec.get_monte_carlo_player_func(
                number_of_samples=27)
        else:
            opponent_func = game_spec.get_random_player_func()
            # opponent_func = game_spec.get_manual_player_func()

        def player_func(board_state, side):
            if mcts:
                _, move = monte_carlo_tree_search(game_spec, board_state, side,
                                                  27, session, input_layer,
                                                  output_layer, True, cnn_on,
                                                  True)
            else:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec)
            # the move returned to the game is in a different configuration
            # than the CNN learn move
            move_for_game = np.asarray(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        if cnn_on:
            create_network_func = functools.partial(cnn.create_network,
                                                    filter_shape, filter_depth,
                                                    dense_width)
        else:
            create_network_func = functools.partial(create_network,
                                                    input_layer, hidden_layers,
                                                    output_layer)

        input_layer, output_layer, variables, _ = create_network_func()

        results = {}
        for i in range(len(gamefiles)):
            t = time.perf_counter()
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                print("loading pre-existing network")
                load_network(session, variables,
                             network_file_path + gamefiles[i])

                results_X = [
                    game_spec.play_game(player_func, opponent_func)
                    for _ in range(int(n / 2))
                ]
                results_O = [
                    game_spec.play_game(opponent_func, player_func)
                    for _ in range(int(n / 2))
                ]

            elapsed_time = time.perf_counter() - t
            results[network_games[i]] = results_X + [-j for j in results_O]
            print('network ' + gamefiles[i])
            # print('Elapsed time:', elapsed_time)
            # print('Network as X (%s games):' % (n / 2))
            # print('Player X wins: ', results_X.count(1) / n * 2)
            # print('Player O wins: ', results_X.count(-1) / n * 2)
            # print('Draws        : ', results_X.count(0) / n * 2)
            # print('Winrate X    : ', 0.5 + 1. * sum(results_X) / n * 2)
            # print('Network as O (%s games):' % (n / 2))
            # print('Player O wins: ', results_O.count(-1) / n * 2)
            # print('Player X wins: ', results_O.count(1) / n * 2)
            # print('Draws        : ', results_O.count(0) / n * 2)
            # print('Winrate O    : ', 0.5 - 1. * sum(results_O) / n * 2)

        with open(network_file_path + '_benchmark_vs_rand.json',
                  'w') as outfile:
            json.dump(results, outfile)
        print(netlist)
        opponent_index = 0

        historical_input_layer, historical_output_layer, historical_variables, _ = create_network()
        networks = []
        for _ in netlist:
            net_input_layer, net_output_layer, net_variables, _ = create_network()
            networks.append((net_input_layer, net_output_layer, net_variables))

        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            print('location: ' + netloc)
            load_network(session, historical_variables, netloc + historic_net)
            print('loaded historic net: ' + historic_net)

            def make_move_historical(net, board_state, side):
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    # move = get_deterministic_network_move(session, net[0], net[1], board_state, side,
                    #                                       valid_only=True, game_spec=game_spec, cnn_on=cnn_on)
                    move = get_stochastic_network_move(session,
                                                       net[0],
                                                       net[1],
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)
                move_for_game = np.asarray(move)
                return game_spec.flat_move_to_tuple(move_for_game.argmax())
import collections