Example #1
def load_network_player(network_filename, hidden_layers):
    session = tf.Session()
    input_layer, output_layer, variables = network_helpers.create_network(
        game_spec.board_squares(), hidden_layers)
    network_helpers.load_network(session, variables, network_filename)

    def network_player(board_state, side):
        print()
        print("Network player (%s)" % side)
        tic_tac_toe.print_game_state(board_state)

        move_probs = network_helpers.get_stochastic_network_move(
            session, input_layer, output_layer, board_state, side, log=True)
        move = game_spec.flat_move_to_tuple(move_probs.argmax())

        print "Network move:", move
        return move
    return network_player
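
A minimal usage sketch (not from the original source; the weights file name and hidden layer sizes are assumptions, and game_spec is the module-level tic-tac-toe spec the example above relies on). It wires the returned player function into game_spec.play_game against a random opponent:

network_player = load_network_player('current_network.p', hidden_layers=(100, 100, 100))

# play one logged game; the result is scored from the first player's perspective (+1 win, 0 draw, -1 loss)
result = game_spec.play_game(network_player,
                             game_spec.get_random_player_func(),
                             log=True)
print("result for the network player:", result)
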
Example #2
    def test_load_variables_into_network_of_wrong_size_gives_friendly_exception(
            self):
        try:
            file_name = 'test.p'
            input_nodes = 20

            _, _, variables1 = create_network(input_nodes, (30, ))
            _, _, variables2 = create_network(input_nodes, (40, ))

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())

                save_network(session, variables1, file_name)

                with self.assertRaises(ValueError):
                    load_network(session, variables2, file_name)
        finally:
            try:
                os.remove(file_name)
            except OSError:
                pass
Example #3
    def test_save_and_load_network(self):
        try:
            file_name = 'test.p'
            input_nodes = 20
            hidden_nodes = (50, 40, 30)
            _, _, variables1 = create_network(input_nodes, hidden_nodes)
            _, _, variables2 = create_network(input_nodes, hidden_nodes)

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())

                save_network(session, variables1, file_name)
                load_network(session, variables2, file_name)

                for var1, var2 in zip(variables1, variables2):
                    np.testing.assert_array_almost_equal(
                        session.run(var1), session.run(var2))
        finally:
            try:
                os.remove(file_name)
            except OSError:
                pass
def predict_best_move_low_level(game_spec, create_network, network_file_path,
                                player, board_state):
    """Make a predicition for the next move at a given state using some lower level parameters

    Args:
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        player: The player to make the move 1 or -1
        board_state: The state of the board at some time during the game

    Returns:
        a vector of zeros with a 1 on the position which represents the best move to be taken
    """
    # note: these placeholders (and policy_gradient below) are built but not actually used for prediction
    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables = create_network()

    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      reduction_indices=1)) * reward_placeholder

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("Loading trained network from ", network_file_path)
            load_network(session, variables, network_file_path)
        else:
            print("File with trained network can't be loaded. Exiting...'")
            return

        return get_stochastic_network_move(session, input_layer, output_layer,
                                           board_state, player)
Example #5
def train_supervised(game_spec,
                     create_network,
                     network_file_path,
                     positions,
                     test_set_ratio=0.4,
                     regularization_coefficent=1e-5,
                     batch_size=100,
                     learn_rate=1e-4,
                     stop_turns_without_improvement=7):
    """Train a network using supervised learning using against a list of game positions and moves chosen.
    We stop after we have had stop_turns_without_improvement without an improvement in the test error.
    The test set is used as a validation set as well, will possibly improve this in the future to have a seperate test
     and validation set.

    Args:
        stop_turns_without_improvement (int): we stop training after this many iterations without any improvement in
            the test error.
        regularization_coefficent (float): amount to multiply the l2 regularizer by in the loss function
        test_set_ratio (float): portion of the data to divide into the test set,
        positions ([(board_state, move)]): list of tuples of board states and the moves chosen in those board_states
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        learn_rate (float):
        batch_size (int):

    Returns:
        episode_number, train_error, train_accuracy, new_test_error, test_accuracy
    """
    input_layer, output_layer, variables = create_network()

    test_set_count = int(len(positions) * test_set_ratio)
    train_set = positions[:-test_set_count]
    test_set = positions[-test_set_count:]

    actual_move_placeholder = tf.compat.v1.placeholder(
        "float", (None, game_spec.outputs()))

    error = tf.reduce_sum(input_tensor=tf.square(actual_move_placeholder -
                                                 output_layer))

    regularizer = None
    for var in variables:
        if regularizer is None:
            regularizer = tf.nn.l2_loss(var)
        else:
            regularizer += tf.nn.l2_loss(var)

    loss = error + regularizer * regularization_coefficent

    train_step = tf.compat.v1.train.RMSPropOptimizer(learn_rate).minimize(loss)

    correct_pred = tf.equal(tf.argmax(input=output_layer, axis=1),
                            tf.argmax(input=actual_move_placeholder, axis=1))
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_pred, tf.float32))

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        if os.path.isfile(network_file_path):
            print("loading existing network")
            load_network(session, variables, network_file_path)

        episode_number = 1
        turns_without_test_improvement = 0

        best_test_error, test_accuracy = session.run(
            [error, accuracy],
            feed_dict={
                input_layer: [x[0] for x in test_set],
                actual_move_placeholder: [x[1] for x in test_set]
            })

        while True:
            random.shuffle(train_set)
            train_error = 0

            for start_index in range(0,
                                     len(train_set) - batch_size + 1,
                                     batch_size):
                mini_batch = train_set[start_index:start_index + batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        input_layer: [x[0] for x in mini_batch],
                        actual_move_placeholder: [x[1] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error, test_accuracy = session.run(
                [error, accuracy],
                feed_dict={
                    input_layer: [x[0] for x in test_set],
                    actual_move_placeholder: [x[1] for x in test_set]
                })

            print("episode: %s train_error: %s test_error: %s test_acc: %s" %
                  (episode_number, train_error, new_test_error, test_accuracy))

            if new_test_error < best_test_error:
                best_test_error = new_test_error
                turns_without_test_improvement = 0
            else:
                turns_without_test_improvement += 1
                if turns_without_test_improvement > stop_turns_without_improvement:
                    train_accuracy = session.run(
                        accuracy,
                        feed_dict={
                            input_layer: [x[0] for x in train_set],
                            actual_move_placeholder: [x[1] for x in train_set]
                        })

                    print(
                        "test error not improving for %s turns, ending training"
                        % (stop_turns_without_improvement, ))
                    break

            episode_number += 1

        print(
            "final episode: %s train_error: %s train acc: %s test_error: %s test_acc: %s"
            % (episode_number, train_error, train_accuracy, new_test_error,
               test_accuracy))

        save_network(session, variables, network_file_path)

    return episode_number, train_error, train_accuracy, new_test_error, test_accuracy
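
A hedged call sketch (all concrete values are assumptions, not taken from the repository): positions would be a list of (board_state, one-hot move) tuples, for example recorded from a scripted player, as the docstring above describes.

import functools

game_spec = tic_tac_toe.TicTacToeGameSpec()       # assumed tic-tac-toe game spec class
create_net = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))

episodes, train_err, train_acc, test_err, test_acc = train_supervised(
    game_spec,
    create_net,
    'supervised_network.p',                       # hypothetical weights file
    positions,                                    # [(board_state, one-hot move), ...]
    test_set_ratio=0.4,
    batch_size=100,
    learn_rate=1e-4)
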
def train_policy_gradients_vs_historic(
        game_spec,
        create_network,
        network_file_path,
        save_network_file_path=None,
        number_of_historic_networks=8,
        save_historic_every=10000,
        historic_network_base_path='historic_network',
        number_of_games=100000,
        print_results_every=1000,
        learn_rate=1e-4,
        batch_size=100):
    """Train a network against itself and over time store new version of itself to play against.

    Args:
        historic_network_base_path (str): Base path to save new historic networks to; a number for the network "slot"
            is appended to the end of this string.
        save_historic_every (int): We save a version of the learning network into one of the historic network
            "slots" every x number of games. We have number_of_historic_networks "slots"
        number_of_historic_networks (int): We keep this many old networks to play against
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the network_file_path param is used.
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):

    Returns:
        [tf.Variable] : trained variables used in the final network
    """
    input_layer, output_layer, variables = create_network()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None,
                                                    game_spec.board_squares()))
    policy_gradient = tf.reduce_sum(
        tf.reshape(reward_placeholder,
                   (-1, 1)) * actual_move_placeholder * output_layer)
    train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(
        -policy_gradient)

    current_historical_index = 0
    historical_networks = []

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=print_results_every)

    for _ in range(number_of_historic_networks):
        historical_input_layer, historical_output_layer, historical_variables = create_network(
        )
        historical_networks.append(
            (historical_input_layer, historical_output_layer,
             historical_variables))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        def make_move_historical(historical_network_index, board_state, side):
            net = historical_networks[historical_network_index]
            move = get_stochastic_network_move(session,
                                               net[0],
                                               net[1],
                                               board_state,
                                               side,
                                               valid_only=True,
                                               game_spec=game_spec)
            return game_spec.flat_move_to_tuple(move.argmax())

        def make_training_move(board_state, side):
            mini_batch_board_states.append(np.ravel(board_state) * side)
            move = get_stochastic_network_move(session,
                                               input_layer,
                                               output_layer,
                                               board_state,
                                               side,
                                               valid_only=True,
                                               game_spec=game_spec)
            mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move.argmax())

        if os.path.isfile(network_file_path):
            print("loading pre existing weights")
            load_network(session, variables, network_file_path)
        else:
            print("could not find previous weights so initialising randomly")

        for i in range(number_of_historic_networks):
            if os.path.isfile(historic_network_base_path + str(i) + '.p'):
                load_network(session, historical_networks[i][2],
                             historic_network_base_path + str(i) + '.p')
            elif os.path.isfile(network_file_path):
                # if we can't load a historical file use the current network weights
                load_network(session, historical_networks[i][2],
                             network_file_path)

        for episode_number in range(1, number_of_games):
            opponent_index = random.randint(0, number_of_historic_networks - 1)
            make_move_historical_for_index = functools.partial(
                make_move_historical, opponent_index)

            # randomize if going first or second
            if bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move,
                                             make_move_historical_for_index)
            else:
                reward = -game_spec.play_game(make_move_historical_for_index,
                                              make_training_move)

            results.append(reward)

            # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)

            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            episode_number += 1

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)
                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                print("episode: %s average result: %s" %
                      (episode_number, np.mean(results)))

            if episode_number % save_historic_every == 0:
                print("saving historical network %s", current_historical_index)
                save_network(
                    session, variables, historic_network_base_path +
                    str(current_historical_index) + '.p')
                load_network(
                    session, historical_networks[current_historical_index][2],
                    historic_network_base_path +
                    str(current_historical_index) + '.p')

                # also save to the main network file
                save_network(session, variables, save_network_file_path
                             or network_file_path)

                current_historical_index += 1
                current_historical_index %= number_of_historic_networks

        # save our final weights
        save_network(session, variables, save_network_file_path
                     or network_file_path)

    return variables
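
The reward bookkeeping above is easy to miss, so here is a small standalone NumPy sketch of the same idea (the game lengths and outcomes are invented): every move of a game is credited with the final outcome divided by that game's length, and the mini-batch of rewards is then mean-centred and divided by its standard deviation before the gradient step.

import numpy as np

game_lengths = [3, 5, 4]      # moves made by the training player in each game
outcomes = [1.0, -1.0, 1.0]   # win, loss, win from the training player's perspective

mini_batch_rewards = []
for length, outcome in zip(game_lengths, outcomes):
    # quick wins score higher per move; slow losses cost less per move
    mini_batch_rewards += [outcome / length] * length

normalized_rewards = np.array(mini_batch_rewards) - np.mean(mini_batch_rewards)
rewards_std = np.std(normalized_rewards)
if rewards_std != 0:
    normalized_rewards /= rewards_std

print(normalized_rewards)     # one entry per recorded board state / move
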
def train_value_network(game_spec,
                        hidden_nodes_reinforcement,
                        reinforcement_network_file_path,
                        hidden_nodes_value,
                        value_network_file_path,
                        learn_rate=1e-4,
                        batch_size=100,
                        train_samples=10000,
                        test_samples=8000):
    reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
        game_spec.board_squares(), hidden_nodes_reinforcement,
        game_spec.outputs())

    value_input_layer, value_output_layer, value_variables = create_network(
        game_spec.board_squares(),
        hidden_nodes_value,
        output_nodes=1,
        output_softmax=False)

    target_placeholder = tf.compat.v1.placeholder("float", (None, 1))
    error = tf.reduce_sum(input_tensor=tf.square(target_placeholder -
                                                 value_output_layer))

    train_step = tf.compat.v1.train.RMSPropOptimizer(learn_rate).minimize(
        error)

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        load_network(session, reinforcement_variables,
                     reinforcement_network_file_path)

        if os.path.isfile(value_network_file_path):
            print("loading previous version of value network")
            load_network(session, value_variables, value_network_file_path)

        def make_move(board_state, side):
            move = get_deterministic_network_move(session,
                                                  reinforcement_input_layer,
                                                  reinforcement_output_layer,
                                                  board_state, side)

            return game_spec.flat_move_to_tuple(np.argmax(move))

        board_states_training = {}
        board_states_test = []
        episode_number = 0

        while len(board_states_training) < train_samples + test_samples:
            board_state = _generate_random_board_position(
                game_spec, (1, game_spec.board_squares() * 0.8))
            board_state_flat = tuple(np.ravel(board_state))

            # only accept the board_state if not already in the dict
            if board_state_flat not in board_states_training:
                result = game_spec.play_game(make_move,
                                             make_move,
                                             board_state=board_state)
                board_states_training[board_state_flat] = float(result)

        # take a random selection from training into a test set
        for _ in range(test_samples):
            sample = random.choice(list(board_states_training.keys()))
            board_states_test.append((sample, board_states_training[sample]))
            del board_states_training[sample]

        board_states_training = list(board_states_training.items())

        test_error = session.run(error,
                                 feed_dict={
                                     value_input_layer:
                                     [x[0] for x in board_states_test],
                                     target_placeholder:
                                     [[x[1]] for x in board_states_test]
                                 })

        while True:
            np.random.shuffle(board_states_training)
            train_error = 0

            for start_index in range(
                    0,
                    len(board_states_training) - batch_size + 1, batch_size):
                mini_batch = board_states_training[start_index:start_index +
                                                   batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        value_input_layer: [x[0] for x in mini_batch],
                        target_placeholder: [[x[1]] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error = session.run(error,
                                         feed_dict={
                                             value_input_layer:
                                             [x[0] for x in board_states_test],
                                             target_placeholder:
                                             [[x[1]]
                                              for x in board_states_test]
                                         })

            print("episode: %s train_error: %s test_error: %s" %
                  (episode_number, train_error, test_error))

            if new_test_error > test_error:
                print("train error went up, stopping training")
                break

            test_error = new_test_error
            episode_number += 1

        save_network(session, value_variables, value_network_file_path)
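
Once train_value_network has saved its weights, a position can be scored by feeding it through the value head. A rough sketch under assumptions (the file name, hidden layer sizes and example position are invented), reusing create_network and load_network from the example above:

value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(), (100, 100, 100), output_nodes=1, output_softmax=False)

with tf.compat.v1.Session() as session:
    session.run(tf.compat.v1.global_variables_initializer())
    load_network(session, value_variables, 'value_network.p')   # hypothetical weights file

    board_state = ((1, 0, 0), (0, -1, 0), (0, 0, 0))            # example tic-tac-toe position
    predicted_value = session.run(value_output_layer,
                                  feed_dict={value_input_layer: [np.ravel(board_state)]})
    print("predicted game outcome:", predicted_value[0][0])
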
def train_policy_gradients_vs_historic(
        game_spec,
        create_network,
        load_network_file_path,
        save_network_file_path=None,
        number_of_historic_networks=1,
        historic_network_base_path='historic_network',
        number_of_games=10000,
        update_opponent_winrate=0.65,
        print_results_every=100,
        learn_rate=1e-3,
        batch_size=100,
        cnn_on=False,
        eps=0.1,
        deterministic=True,
        mcts=False,
        min_win_ticks=3,
        beta=0.01):
    """Train a network against itself and over time store new version of itself to play against.

    Args:
        historic_network_base_path (str): Base path to save new historic networks to; a number for the network "slot"
            is appended to the end of this string.
        number_of_historic_networks (int): We keep this many old networks to play against
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the load_network_file_path param is used.
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        load_network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        update_opponent_winrate (float): the required winrate before updating the opponent to the newer agent
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        cnn_on (bool): use convolutional or regular neural network
        eps (float): fraction of moves made randomly
        deterministic (bool): use deterministic or stochastic move selection
        min_win_ticks (int): number of times the networks winrate needs to exceed update_opponent_winrate to update

    Returns:
        [tf.Variable] : trained variables used in the final network
    """
    save_network_file_path = save_network_file_path or load_network_file_path
    # create folder if it does not exist
    if save_network_file_path:
        split = save_network_file_path.split('/')
        directory = '/'.join(split[:-1]) or '.'
        if not os.path.isdir(directory):
            os.makedirs(directory)
            print("created directory " + directory)

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables, weights = create_network()

    baseline = np.zeros([100, 1])
    baselineCounter = 0
    # note: np.mean(baseline) is a plain NumPy value evaluated once when the graph is built (0.0 here),
    # so later updates to the baseline array do not change this expression
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      axis=1)) * (reward_placeholder - np.mean(baseline))
    #policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer) #Original one from historic
    #train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient) # Why is this one different from the other train policy grad?

    regularizer = sum([tf.nn.l2_loss(i) for i in weights])
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient +
                                                             beta *
                                                             regularizer)

    current_historical_index = 0  # We will (probably) not use this: we always train against the most recent agent
    historical_networks = []

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=print_results_every)

    for _ in range(number_of_historic_networks):
        historical_input_layer, historical_output_layer, historical_variables, _ = create_network(
        )
        historical_networks.append(
            (historical_input_layer, historical_output_layer,
             historical_variables))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        #testvar = variables[0] #\\ test for checking variables
        #print(session.run(variables[0]))
        base_episode_number = 0
        winrates = []

        if load_network_file_path and os.path.isfile(load_network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, load_network_file_path)
            base_episode_number, winrates = load_results(
                load_network_file_path)
        else:
            print('Creating new network')

        def make_move_historical(historical_network_index, board_state, side):
            net = historical_networks[historical_network_index]
            #move = get_stochastic_network_move(session, net[0], net[1], board_state, side,
            #                                  valid_only=True, game_spec=game_spec, CNN_ON=cnn_on)
            if mcts:
                # note: this branch searches with the training network's input_layer/output_layer
                # rather than the historic network's net[0]/net[1]
                _, move = monte_carlo_tree_search(game_spec, board_state, side,
                                                  27, session, input_layer,
                                                  output_layer, True, cnn_on,
                                                  True)
            else:
                move = get_deterministic_network_move(session,
                                                      net[0],
                                                      net[1],
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)

            move_for_game = np.asarray(
                move)  # ensure the move is an ndarray; mcts does not necessarily return one
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        def make_training_move(board_state, side):
            if cnn_on:
                np_board_state = create_3x3_board_states(board_state)
            else:
                np_board_state = np.array(board_state)

            mini_batch_board_states.append(np_board_state * side)

            rand_numb = random.uniform(0., 1.)
            if rand_numb < eps:
                move = get_random_network_move(board_state, game_spec)
            elif deterministic:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)
            else:
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    move = get_stochastic_network_move(session,
                                                       input_layer,
                                                       output_layer,
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)

            move_for_game = np.asarray(
                move
            )  # the move returned to the game has a different layout than the move the CNN learns from
            if cnn_on:
                # the mini-batch board states are stored in the layout the network expects (the adapted
                # board state), so the mini-batch moves must be stored the same way
                move = create_3x3_board_states(np.reshape(
                    move, [9, 9]))  # the function requires a 9x9 array
                mini_batch_moves.append(move[0:81])
            else:
                mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        #for i in range(number_of_historic_networks):
        if os.path.isfile(historic_network_base_path + str(0) + '.p'):
            load_network(session, historical_networks[0][2],
                         historic_network_base_path + str(0) + '.p')
            print('Historic network loaded')
        else:
            # if we can't load a historical file use the current network weights
            print(
                'Warning: loading historical file failed. Current net is saved and being used as historic net.'
            )
            historic_filename = historic_network_base_path + str(
                current_historical_index) + '.p'
            save_network(session, variables, historic_filename)
            load_network(session,
                         historical_networks[current_historical_index][2],
                         historic_filename)

        win_ticks = 0  # counts how many times the agent's winrate has been high enough to update its opponent
        for episode_number in range(1, number_of_games):
            opponent_index = random.randint(0, number_of_historic_networks - 1)
            make_move_historical_for_index = functools.partial(
                make_move_historical, opponent_index)

            # randomize if going first or second
            if bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move,
                                             make_move_historical_for_index)
            else:
                reward = -game_spec.play_game(make_move_historical_for_index,
                                              make_training_move)

            results.append(reward)
            baseline[baselineCounter] = reward
            baselineCounter += 1
            baselineCounter = baselineCounter % 100

            # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)
            reward /= float(last_game_length)
            mini_batch_rewards += ([reward] * last_game_length)
            episode_number += 1

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                winrate = _win_rate(print_results_every, results)
                if winrate == 0:
                    print('DEBUG TEST')
                winrates.append(
                    [base_episode_number + episode_number, winrate])
                print("episode: %s win_rate: %s" %
                      (base_episode_number + episode_number, winrate))
                if save_network_file_path:
                    save_network(
                        session, variables,
                        time.strftime(save_network_file_path[:-2] + "_ep" +
                                      str(base_episode_number +
                                          episode_number) +
                                      "_%Y-%m-%d_%H%M%S.p"))

            # Update opponent when winrate is high enough and it happens for a longer period
            if (episode_number % print_results_every
                    == 0) and (winrate >= update_opponent_winrate):
                win_ticks += 1
                if win_ticks >= min_win_ticks:
                    win_ticks = 0
                    first_bot = False
                    print("saving historical network %s at episode %s." %
                          (current_historical_index,
                           base_episode_number + episode_number)
                          )  # Overwrite historic opponent with current network
                    historic_filename = historic_network_base_path + str(
                        current_historical_index) + '.p'
                    save_network(session, variables, historic_filename)
                    load_network(
                        session,
                        historical_networks[current_historical_index][2],
                        historic_filename)

                    # also save to the main network file
                    save_network(session, variables,
                                 (save_network_file_path
                                  or load_network_file_path)[:-2] + "_ep" +
                                 str(base_episode_number + episode_number) +
                                 ".p")

                    current_historical_index += 1  # Not used when we only have 1 historic network
                    current_historical_index %= number_of_historic_networks

        # save our final weights
        save_network(session, variables, save_network_file_path
                     or load_network_file_path)

    return variables, _win_rate(print_results_every, results), winrates
Example #9
def benchmark(game_spec,
              network_file_path,
              create_network_func,
              log_games=False,
              games_vs_random=500):
    """Plays games against a variety of algorithms to see how good a network is. Results are currently just
    printed to std out

    Args:
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network_func (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        log_games (bool): If True print all positions from all games played
        games_vs_random (int): Number of games to play vs random opponents
    """
    input_layer, output_layer, variables = create_network_func()

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())
        load_network(session, variables, network_file_path)

        def make_move(board_state, side):
            move = get_deterministic_network_move(session,
                                                  input_layer,
                                                  output_layer,
                                                  board_state,
                                                  side,
                                                  valid_only=True,
                                                  game_spec=game_spec)
            return game_spec.flat_move_to_tuple(move.argmax())

        def min_max_move_func(board_state, side, depth):
            return min_max_alpha_beta(game_spec, board_state, side, depth)[1]

        def monte_carlo_move_func(board_state, side):
            return monte_carlo_tree_search_uct(game_spec, board_state, side,
                                               100000)[1]

        results = []
        for _ in range(int(games_vs_random / 2)):
            result = game_spec.play_game(make_move,
                                         game_spec.get_random_player_func(),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(game_spec.get_random_player_func(),
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs random = %s" % (sum(results), ))

        results = []
        for _ in range(1):
            result = game_spec.play_game(make_move,
                                         functools.partial(min_max_move_func,
                                                           depth=2),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(functools.partial(min_max_move_func,
                                                           depth=2),
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs min max depth 2 = %s" % (sum(results), ))

        results = []
        for _ in range(1):
            result = game_spec.play_game(make_move,
                                         functools.partial(min_max_move_func,
                                                           depth=4),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(functools.partial(min_max_move_func,
                                                           depth=4),
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs min max depth 4 = %s" % (sum(results), ))

        results = []
        for _ in range(1):
            result = game_spec.play_game(make_move,
                                         functools.partial(min_max_move_func,
                                                           depth=6),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(functools.partial(min_max_move_func,
                                                           depth=6),
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs min max depth 6 = %s" % (sum(results), ))

        results = []
        for _ in range(1):
            result = game_spec.play_game(make_move,
                                         functools.partial(min_max_move_func,
                                                           depth=8),
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(functools.partial(min_max_move_func,
                                                           depth=8),
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs min max depth 8 = %s" % (sum(results), ))

        results = []
        for _ in range(1):
            result = game_spec.play_game(make_move,
                                         monte_carlo_move_func,
                                         log=log_games)
            results.append(result)
            result = game_spec.play_game(monte_carlo_move_func,
                                         make_move,
                                         log=log_games)
            results.append(-result)

        print("*** results vs monte carlo uct 100000 = %s" % (sum(results), ))
Example #10
value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(),
    HIDDEN_NODES_VALUE,
    output_nodes=1,
    output_softmax=False)

target_placeholder = tf.placeholder("float", (None, 1))
error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer))

train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error)

with tf.Session() as session:
    session.run(tf.initialize_all_variables())

    load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH)

    if os.path.isfile(VALUE_NETWORK_PATH):
        print("loading previous version of value network")
        load_network(session, value_variables, VALUE_NETWORK_PATH)

    def make_move(board_state, side):
        move = get_deterministic_network_move(session,
                                              reinforcement_input_layer,
                                              reinforcement_output_layer,
                                              board_state, side)

        return game_spec.flat_move_to_tuple(np.argmax(move))

    board_states_training = {}
    board_states_test = []
def train_policy_gradients(game_spec,
                           create_network,
                           load_network_file_path,
                           save_network_file_path=None,
                           opponent_func=None,
                           number_of_games=10000,
                           print_results_every=1000,
                           learn_rate=1e-4,
                           batch_size=100,
                           randomize_first_player=True,
                           cnn_on=False,
                           eps=0.1,
                           deterministic=True,
                           mcts=False,
                           beta=0.01):
    """Train a network using policy gradients

    Args:
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the network_file_path param is used.
        opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an opponent playing
            randomly
        randomize_first_player (bool): If True we alternate between being the first and second player
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        load_network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        cnn_on: if True, then the convolutional neural network is used

    Returns:
        (variables used in the final network : list, win rate: float)
    """

    save_network_file_path = save_network_file_path or load_network_file_path
    # create folder if it does not exist
    if save_network_file_path:
        split = save_network_file_path.split('/')
        directory = '/'.join(split[:-1]) or '.'
        if not os.path.isdir(directory):
            os.makedirs(directory)
            print("created directory " + directory)

    if mcts:
        opponent_func = game_spec.get_monte_carlo_player_func(
            number_of_samples=27)
    else:
        opponent_func = opponent_func or game_spec.get_random_player_func()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables, weights = create_network()
    baseline = np.zeros([100, 1])
    baselineCounter = 0
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      axis=1)) * (reward_placeholder - np.mean(baseline))

    #regularizer = sum([tf.nn.l2_loss(i) for i in weights])
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(
        -policy_gradient)  # + beta * regularizer)
    #train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # load existing network and keep track of number of games played
        base_episode_number = 0
        winrates = []
        if load_network_file_path and os.path.isfile(load_network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, load_network_file_path)
            base_episode_number, winrates = load_results(
                load_network_file_path)

        mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
        results = collections.deque(maxlen=print_results_every)

        def make_training_move(board_state, side):
            if cnn_on:
                # We must have the first 3x3 board as the first 9 entries of the list, the second 3x3 board as the
                # next 9 entries, etc. This is what the CNN expects: it takes the first 9 entries and forms a 3x3
                # board, and so on. If the 10 split 3x3 boards are desired, use create_3x3_board_states(board_state).
                np_board_state = create_3x3_board_states(board_state)
            else:
                np_board_state = np.array(board_state)
            np_board_state[np_board_state > 1] = 0
            mini_batch_board_states.append(
                np_board_state * side
            )  # append every state used in the mini-batch (the sign encodes whose move it was)

            rand_numb = random.uniform(0., 1.)
            if rand_numb < eps:
                move = get_random_network_move(board_state, game_spec)
            elif deterministic:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)
            else:
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    move = get_stochastic_network_move(session,
                                                       input_layer,
                                                       output_layer,
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)
            move_for_game = np.asarray(
                move
            )  # the move returned to the game has a different layout than the move the CNN learns from
            if cnn_on:
                # the mini-batch board states are stored in the layout the network expects (the adapted
                # board state), so the mini-batch moves must be stored the same way
                move = create_3x3_board_states(np.reshape(
                    move, [9, 9]))  # the function requires a 9x9 array
                mini_batch_moves.append(move[0:81])
            else:
                mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        for episode_number in range(1, number_of_games + 1):
            # randomize if going first or second
            if (not randomize_first_player) or bool(random.getrandbits(1)):
                reward = game_spec.play_game(
                    make_training_move,
                    opponent_func)  # In this line one game is played.
            else:
                reward = -game_spec.play_game(opponent_func,
                                              make_training_move)

            results.append(reward)
            baseline[baselineCounter] = reward
            baselineCounter += 1
            baselineCounter = baselineCounter % 100
            # we scale here so winning quickly is better than winning slowly and losing slowly better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)

            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                winrate = _win_rate(print_results_every, results)
                winrates.append(
                    [base_episode_number + episode_number, winrate])
                print("episode: %s win_rate: %s" %
                      (base_episode_number + episode_number, winrate))
                if save_network_file_path:
                    save_network(
                        session, variables,
                        time.strftime(save_network_file_path[:-2] + "_ep" +
                                      str(base_episode_number +
                                          episode_number) +
                                      "_%Y-%m-%d_%H%M%S.p"))

        if save_network_file_path:
            save_network(session, variables, save_network_file_path)

    return variables, _win_rate(print_results_every, results), winrates
Example #12
def train_policy_gradients(game_spec,
                           create_network,
                           network_file_path,
                           save_network_file_path=None,
                           opponent_func=None,
                           number_of_games=10000,
                           print_results_every=1000,
                           learn_rate=1e-4,
                           batch_size=100,
                           randomize_first_player=True):
    """Train a network using policy gradients

    Args:
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the network_file_path param is used.
        opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an opponent playing
            randomly
        randomize_first_player (bool): If True we alternate between being the first and second player
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):

    Returns:
        (variables used in the final network : list, win rate: float)
    """
    save_network_file_path = save_network_file_path or network_file_path
    opponent_func = opponent_func or game_spec.get_random_player_func()
    reward_placeholder = tf.placeholder("float", shape=(None,))
    actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables = create_network()

    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer), reduction_indices=1)) * reward_placeholder
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient)
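    # policy_gradient above is log(pi(a|s)) for the move that was actually played,
    # weighted by the (normalized) reward of the game it came from; minimizing its
    # negative with Adam is the standard REINFORCE-style policy-gradient update.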

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
        results = collections.deque(maxlen=print_results_every)

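        # make_training_move flattens the board state and multiplies it by `side` so the
        # network always sees the position from the perspective of the player to move,
        # then samples a move stochastically and records the state/move pair for the next
        # mini-batch update.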
        def make_training_move(board_state, side):
            mini_batch_board_states.append(np.ravel(board_state) * side)
            move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side)
            mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move.argmax())

        for episode_number in range(1, number_of_games + 1):
            # randomize if going first or second
            if (not randomize_first_player) or bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move, opponent_func)
            else:
                reward = -game_spec.play_game(opponent_func, make_training_move)

            results.append(reward)

            # we scale here so winning quickly is better than winning slowly, and losing slowly is better than
            # losing quickly
            last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards)

            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")
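                # subtracting the batch mean and dividing by the standard deviation acts as
                # a simple baseline: roughly half of the moves end up with a positive weight
                # and half with a negative one, which reduces the variance of the gradient
                # estimate.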

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step, feed_dict={input_layer: np_mini_batch_board_states,
                                                   reward_placeholder: normalized_rewards,
                                                   actual_move_placeholder: mini_batch_moves})

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                print("episode: %s win_rate: %s" % (episode_number, _win_rate(print_results_every, results)))
                if network_file_path:
                    save_network(session, variables, save_network_file_path)

        if network_file_path:
            save_network(session, variables, save_network_file_path)

    return variables, _win_rate(print_results_every, results)
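
A minimal call sketch (an addition, not from the original repository), assuming a tic-tac-toe game spec class and the two-argument create_network helper shown in the earlier examples; the module paths, class name, layer sizes and file name are illustrative assumptions.

import functools

from games.tic_tac_toe import TicTacToeGameSpec    # assumed module/class name
from common.network_helpers import create_network  # assumed module path

game_spec = TicTacToeGameSpec()
create_net_func = functools.partial(create_network,
                                    game_spec.board_squares(), (100, 100, 100))

variables, win_rate = train_policy_gradients(game_spec, create_net_func,
                                             'current_network.p',
                                             number_of_games=10000,
                                             print_results_every=1000)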
Example #13
0
    def test_game_network(self):

        network_file_path = r'C:\Users\User\APH\1B 2017 2018\Advanced Machine Learning\Resit\Git\QLUT\networks\cnn_50_50_50_e-3_stoch_mcts\\'
        n = 100
        steps_games = 10
        cnn_on = True
        mcts = True
        filter_shape = [3, 3]
        filter_depth = [10, 10, 10]
        dense_width = []
        input_layer = 90
        hidden_layers = [51, 51, 51]
        output_layer = 81

        f = []
        for (dirpath, dirnames, filenames) in os.walk(network_file_path):
            f.extend(filenames)
            break

        netlist_hist = []
        raw_netlist = []
        for file in f:
            p = re.compile(r'net_ep\d+_.+\.p')
            if 'config' in file:
                pass
            elif 'hist' in file:
                historic_net = file
            elif p.search(file) is None:
                netlist_hist.append(file)
            else:
                raw_netlist.append(file)

        nr_games = []
        for i, name in enumerate(raw_netlist):
            nr_games.append((int(name[6:-20]), i))
        nr_games.sort()

        netlist = [raw_netlist[i[1]] for i in nr_games]
        gamefiles = [
            netlist[(i + 1) * steps_games - 1]
            for i in range(0, int(len(netlist) / steps_games))
        ]
        network_games = [
            nr_games[(i + 1) * steps_games - 1][0]
            for i in range(0, int(len(netlist) / steps_games))
        ]
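        # netlist is now ordered by the episode count embedded in each file name;
        # gamefiles keeps every steps_games-th checkpoint so only a subset of the saved
        # networks is benchmarked, and network_games holds the matching episode counts.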
        print(gamefiles)
        print(gamefiles[0][1])

        # if resultfile exists:
        # network_info = load_results(network_file_path, results_only = False)
        # input_layer = network_info['input_layer']
        # hidden_layers = network_info['hidden_layers']
        # output_layer = network_info['output_layer']
        # otherwise look up values

        game_spec = ut.UltimateTicTacToeGameSpec()

        if mcts:
            opponent_func = game_spec.get_monte_carlo_player_func(
                number_of_samples=27)
        else:
            opponent_func = game_spec.get_random_player_func()

        # opponent_func = game_spec.get_manual_player_func()

        def player_func(board_state, side):
            if mcts:
                _, move = monte_carlo_tree_search(game_spec, board_state, side,
                                                  27, session, input_layer,
                                                  output_layer, True, cnn_on,
                                                  True)
            else:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec)

            # the move handed back to the game uses a different configuration than the
            # move representation the CNN learned on
            move_for_game = np.asarray(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        if cnn_on:
            create_network_func = functools.partial(cnn.create_network,
                                                    filter_shape, filter_depth,
                                                    dense_width)
        else:
            create_network_func = functools.partial(create_network,
                                                    input_layer, hidden_layers,
                                                    output_layer)

        input_layer, output_layer, variables, _ = create_network_func()
        results = {}
        for i in range(len(gamefiles)):
            t = time.perf_counter()
            with tf.Session() as session:
                session.run(tf.global_variables_initializer())
                print("loading pre-existing network")
                load_network(session, variables,
                             network_file_path + gamefiles[i])  #\\

                results_X = [
                    game_spec.play_game(player_func, opponent_func)
                    for _ in range(int(n / 2))
                ]
                results_O = [
                    game_spec.play_game(opponent_func, player_func)
                    for _ in range(int(n / 2))
                ]
                elapsed_time = time.perf_counter() - t

                results[network_games[i]] = results_X + [-j for j in results_O]
                print('network ' + gamefiles[i])
                """
				print('Elapsed time:', elapsed_time)
				print('Network as X (%s games):' % (n/2))
				print('Player X wins: ', results_X.count(1)/n*2)
				print('Player O wins: ', results_X.count(-1)/n*2)
				print('Draws        : ', results_X.count(0)/n*2)
				print('Winrate X    : ', 0.5 + 1.*sum(results_X)/n*2)

				print('Network as O (%s games):' % (n/2))
				print('Player O wins: ', results_O.count(-1)/n*2)
				print('Player X wins: ', results_O.count(1)/n*2)
				print('Draws        : ', results_O.count(0)/n*2)
				print('Winrate O    : ', 0.5 - 1.*sum(results_O)/n*2)

				"""

        with open(network_file_path + '_benchmark_vs_rand.json',
                  'w') as outfile:
            json.dump(results, outfile)
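
The JSON written above maps each checkpoint's training-game count to a list of outcomes (1 win, 0 draw, -1 loss, all from the network's point of view). A small sketch, not part of the original test, for summarizing such a file afterwards; the function name is arbitrary.

import json

def summarize_benchmark(json_path):
    # keys are the number of training games (stored as strings by json), values are
    # lists of per-game outcomes from the network's perspective
    with open(json_path) as f:
        results = json.load(f)
    for games_trained in sorted(results, key=int):
        outcomes = results[games_trained]
        n = len(outcomes)
        print('%s games trained: win %.2f draw %.2f loss %.2f' %
              (games_trained, outcomes.count(1) / n,
               outcomes.count(0) / n, outcomes.count(-1) / n))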
print(netlist)

opponent_index = 0

historical_input_layer, historical_output_layer, historical_variables, _ = create_network(
)

networks = []
for _ in netlist:
    net_input_layer, net_output_layer, net_variables, _ = create_network()
    networks.append((net_input_layer, net_output_layer, net_variables))

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    print('location: ' + netloc)
    load_network(session, historical_variables, netloc + historic_net)
    print('loaded historic net: ' + historic_net)

    def make_move_historical(net, board_state, side):
        if mcts:
            _, move = monte_carlo_tree_search(game_spec, board_state, side, 27,
                                              session, input_layer,
                                              output_layer, True, cnn_on, True)
        else:
            # move = get_deterministic_network_move(session, net[0], net[1], board_state, side,
            #                                       valid_only=True, game_spec=game_spec, cnn_on=cnn_on)
            move = get_stochastic_network_move(session,
                                               net[0],
                                               net[1],
                                               board_state,
                                               side,
Example #15
0
import collections