Example #1
    def test_load_variables_into_network_of_wrong_size_gives_friendly_exception(
            self):
        try:
            file_name = 'test.p'
            input_nodes = 20

            _, _, variables1 = create_network(input_nodes, (30, ))
            _, _, variables2 = create_network(input_nodes, (40, ))

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())

                save_network(session, variables1, file_name)

                with self.assertRaises(ValueError):
                    load_network(session, variables2, file_name)
        finally:
            try:
                os.remove(file_name)
            except OSError:
                pass
Example #2
    def test_save_and_load_network(self):
        try:
            file_name = 'test.p'
            input_nodes = 20
            hidden_nodes = (50, 40, 30)
            _, _, variables1 = create_network(input_nodes, hidden_nodes)
            _, _, variables2 = create_network(input_nodes, hidden_nodes)

            with tf.Session() as session:
                session.run(tf.global_variables_initializer())

                save_network(session, variables1, file_name)
                load_network(session, variables2, file_name)

                for var1, var2 in zip(variables1, variables2):
                    np.testing.assert_array_almost_equal(
                        session.run(var1), session.run(var2))
        finally:
            try:
                os.remove(file_name)
            except OSError:
                pass
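
The two tests above exercise save_network and load_network, which are not shown on this page. The '.p' file extension and the expected ValueError suggest a pickle-based format; the following is only a guess at what those helpers might look like, not the project's actual implementation:

import pickle


def save_network(session, variables, file_path):
    # evaluate every variable and pickle the resulting list of numpy arrays
    variable_values = session.run(variables)
    with open(file_path, 'wb') as f:
        pickle.dump(variable_values, f)


def load_network(session, variables, file_path):
    with open(file_path, 'rb') as f:
        variable_values = pickle.load(f)
    # refuse to load weights whose shapes do not match the target network
    if len(variable_values) != len(variables) or any(
            list(value.shape) != variable.get_shape().as_list()
            for value, variable in zip(variable_values, variables)):
        raise ValueError(
            "saved values do not match the shape of the target network")
    for value, variable in zip(variable_values, variables):
        session.run(variable.assign(value))
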
Example #3
def train_supervised(game_spec,
                     create_network,
                     network_file_path,
                     positions,
                     test_set_ratio=0.4,
                     regularization_coefficent=1e-5,
                     batch_size=100,
                     learn_rate=1e-4,
                     stop_turns_without_improvement=7):
    """Train a network using supervised learning using against a list of game positions and moves chosen.
    We stop after we have had stop_turns_without_improvement without an improvement in the test error.
    The test set is used as a validation set as well, will possibly improve this in the future to have a seperate test
     and validation set.

    Args:
        stop_turns_without_improvement (int): we stop training after this many iterations without any improvement in
            the test error.
        regularization_coefficent (float): amount to multiply the l2 regularizer by in the loss function
        test_set_ratio (float): portion of the data to divide into the test set.
        positions ([(board_state, move)]): list of tuples of board states and the moves chosen in those board_states
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        learn_rate (float):
        batch_size (int):

    Returns:
        episode_number, train_error, train_accuracy, new_test_error, test_accuracy
    """
    input_layer, output_layer, variables = create_network()

    test_set_count = int(len(positions) * test_set_ratio)
    train_set = positions[:-test_set_count]
    test_set = positions[-test_set_count:]

    actual_move_placeholder = tf.compat.v1.placeholder(
        "float", (None, game_spec.outputs()))

    error = tf.reduce_sum(input_tensor=tf.square(actual_move_placeholder -
                                                 output_layer))

    regularizer = None
    for var in variables:
        if regularizer is None:
            regularizer = tf.nn.l2_loss(var)
        else:
            regularizer += tf.nn.l2_loss(var)

    loss = error + regularizer * regularization_coefficent

    train_step = tf.compat.v1.train.RMSPropOptimizer(learn_rate).minimize(loss)

    correct_pred = tf.equal(tf.argmax(input=output_layer, axis=1),
                            tf.argmax(input=actual_move_placeholder, axis=1))
    accuracy = tf.reduce_mean(input_tensor=tf.cast(correct_pred, tf.float32))

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        if os.path.isfile(network_file_path):
            print("loading existing network")
            load_network(session, variables, network_file_path)

        episode_number = 1
        turns_without_test_improvement = 0

        best_test_error, test_accuracy = session.run(
            [error, accuracy],
            feed_dict={
                input_layer: [x[0] for x in test_set],
                actual_move_placeholder: [x[1] for x in test_set]
            })

        while True:
            random.shuffle(train_set)
            train_error = 0

            for start_index in range(0,
                                     len(train_set) - batch_size + 1,
                                     batch_size):
                mini_batch = train_set[start_index:start_index + batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        input_layer: [x[0] for x in mini_batch],
                        actual_move_placeholder: [x[1] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error, test_accuracy = session.run(
                [error, accuracy],
                feed_dict={
                    input_layer: [x[0] for x in test_set],
                    actual_move_placeholder: [x[1] for x in test_set]
                })

            print("episode: %s train_error: %s test_error: %s test_acc: %s" %
                  (episode_number, train_error, new_test_error, test_accuracy))

            if new_test_error < best_test_error:
                best_test_error = new_test_error
                turns_without_test_improvement = 0
            else:
                turns_without_test_improvement += 1
                if turns_without_test_improvement > stop_turns_without_improvement:
                    train_accuracy = session.run(
                        accuracy,
                        feed_dict={
                            input_layer: [x[0] for x in train_set],
                            actual_move_placeholder: [x[1] for x in train_set]
                        })

                    print(
                        "test error not improving for %s turns, ending training"
                        % (stop_turns_without_improvement, ))
                    break

            episode_number += 1

        print(
            "final episode: %s train_error: %s train acc: %s test_error: %s test_acc: %s"
            % (episode_number, train_error, train_accuracy, new_test_error,
               test_accuracy))

        save_network(session, variables, network_file_path)

    return episode_number, train_error, train_accuracy, new_test_error, test_accuracy
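
The early-stopping rule used in train_supervised can be isolated into a short self-contained sketch (illustrative names only): training stops once the test error has failed to improve for more than stop_turns_without_improvement consecutive epochs.

def should_stop(test_errors, stop_turns_without_improvement=7):
    # returns True once the error has not improved for more than
    # stop_turns_without_improvement consecutive entries
    best = float('inf')
    turns_without_improvement = 0
    for test_error in test_errors:
        if test_error < best:
            best = test_error
            turns_without_improvement = 0
        else:
            turns_without_improvement += 1
            if turns_without_improvement > stop_turns_without_improvement:
                return True
    return False


assert should_stop([1.0, 0.9, 0.8] + [0.85] * 8)
assert not should_stop([1.0, 0.9, 0.8, 0.85, 0.84, 0.83])
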
def train_policy_gradients_vs_historic(
        game_spec,
        create_network,
        network_file_path,
        save_network_file_path=None,
        number_of_historic_networks=8,
        save_historic_every=10000,
        historic_network_base_path='historic_network',
        number_of_games=100000,
        print_results_every=1000,
        learn_rate=1e-4,
        batch_size=100):
    """Train a network against itself and over time store new version of itself to play against.

    Args:
        historic_network_base_path (str): Base path to save new historic networks to; a number for the network "slot"
            is appended to the end of this string.
        save_historic_every (int): We save a version of the learning network into one of the historic network
            "slots" every x number of games. We have number_of_historic_networks "slots"
        number_of_historic_networks (int): We keep this many old networks to play against
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the network_file_path param is used.
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):

    Returns:
        [tf.Variable]: trained variables used in the final network
    """
    input_layer, output_layer, variables = create_network()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None,
                                                    game_spec.board_squares()))
    policy_gradient = tf.reduce_sum(
        tf.reshape(reward_placeholder,
                   (-1, 1)) * actual_move_placeholder * output_layer)
    train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(
        -policy_gradient)

    current_historical_index = 0
    historical_networks = []

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=print_results_every)

    for _ in range(number_of_historic_networks):
        historical_input_layer, historical_output_layer, historical_variables = create_network(
        )
        historical_networks.append(
            (historical_input_layer, historical_output_layer,
             historical_variables))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        def make_move_historical(historical_network_index, board_state, side):
            net = historical_networks[historical_network_index]
            move = get_stochastic_network_move(session,
                                               net[0],
                                               net[1],
                                               board_state,
                                               side,
                                               valid_only=True,
                                               game_spec=game_spec)
            return game_spec.flat_move_to_tuple(move.argmax())

        def make_training_move(board_state, side):
            mini_batch_board_states.append(np.ravel(board_state) * side)
            move = get_stochastic_network_move(session,
                                               input_layer,
                                               output_layer,
                                               board_state,
                                               side,
                                               valid_only=True,
                                               game_spec=game_spec)
            mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move.argmax())

        if os.path.isfile(network_file_path):
            print("loading pre existing weights")
            load_network(session, variables, network_file_path)
        else:
            print("could not find previous weights so initialising randomly")

        for i in range(number_of_historic_networks):
            if os.path.isfile(historic_network_base_path + str(i) + '.p'):
                load_network(session, historical_networks[i][2],
                             historic_network_base_path + str(i) + '.p')
            elif os.path.isfile(network_file_path):
                # if we can't load a historical file use the current network weights
                load_network(session, historical_networks[i][2],
                             network_file_path)

        for episode_number in range(1, number_of_games):
            opponent_index = random.randint(0, number_of_historic_networks - 1)
            make_move_historical_for_index = functools.partial(
                make_move_historical, opponent_index)

            # randomize if going first or second
            if bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move,
                                             make_move_historical_for_index)
            else:
                reward = -game_spec.play_game(make_move_historical_for_index,
                                              make_training_move)

            results.append(reward)

            # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)

            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            episode_number += 1

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)
                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                print("episode: %s average result: %s" %
                      (episode_number, np.mean(results)))

            if episode_number % save_historic_every == 0:
                print("saving historical network %s", current_historical_index)
                save_network(
                    session, variables, historic_network_base_path +
                    str(current_historical_index) + '.p')
                load_network(
                    session, historical_networks[current_historical_index][2],
                    historic_network_base_path +
                    str(current_historical_index) + '.p')

                # also save to the main network file
                save_network(session, variables, save_network_file_path
                             or network_file_path)

                current_historical_index += 1
                current_historical_index %= number_of_historic_networks

        # save our final weights
        save_network(session, variables, save_network_file_path
                     or network_file_path)

    return variables
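
The reward bookkeeping in the loop above can be reproduced in isolation with plain numpy: each position of a game receives the game result divided by the game length (so quick wins weigh more per move than slow ones), and each mini batch is mean-centred and divided by its standard deviation before the update. This is only an illustrative sketch:

import numpy as np


def scaled_rewards(game_results, game_lengths):
    rewards = []
    for result, length in zip(game_results, game_lengths):
        # every position of the game gets the result scaled by 1/game_length
        rewards += [result / float(length)] * length
    rewards = np.array(rewards) - np.mean(rewards)
    std = np.std(rewards)
    return rewards / std if std != 0 else rewards


print(scaled_rewards([1, -1], [3, 5]))  # a quick win gets a larger per-position reward than a slow loss
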
def train_value_network(game_spec,
                        hidden_nodes_reinforcement,
                        reinforcement_network_file_path,
                        hidden_nodes_value,
                        value_network_file_path,
                        learn_rate=1e-4,
                        batch_size=100,
                        train_samples=10000,
                        test_samples=8000):
    reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network(
        game_spec.board_squares(), hidden_nodes_reinforcement,
        game_spec.outputs())

    value_input_layer, value_output_layer, value_variables = create_network(
        game_spec.board_squares(),
        hidden_nodes_value,
        output_nodes=1,
        output_softmax=False)

    target_placeholder = tf.compat.v1.placeholder("float", (None, 1))
    error = tf.reduce_sum(input_tensor=tf.square(target_placeholder -
                                                 value_output_layer))

    train_step = tf.compat.v1.train.RMSPropOptimizer(learn_rate).minimize(
        error)

    with tf.compat.v1.Session() as session:
        session.run(tf.compat.v1.global_variables_initializer())

        load_network(session, reinforcement_variables,
                     reinforcement_network_file_path)

        if os.path.isfile(value_network_file_path):
            print("loading previous version of value network")
            load_network(session, value_variables, value_network_file_path)

        def make_move(board_state, side):
            move = get_deterministic_network_move(session,
                                                  reinforcement_input_layer,
                                                  reinforcement_output_layer,
                                                  board_state, side)

            return game_spec.flat_move_to_tuple(np.argmax(move))

        board_states_training = {}
        board_states_test = []
        episode_number = 0

        while len(board_states_training) < train_samples + test_samples:
            board_state = _generate_random_board_position(
                game_spec, (1, game_spec.board_squares() * 0.8))
            board_state_flat = tuple(np.ravel(board_state))

            # only accept the board_state if not already in the dict
            if board_state_flat not in board_states_training:
                result = game_spec.play_game(make_move,
                                             make_move,
                                             board_state=board_state)
                board_states_training[board_state_flat] = float(result)

        # take a random selection from training into a test set
        for _ in range(test_samples):
            sample = random.choice(list(board_states_training.keys()))
            board_states_test.append((sample, board_states_training[sample]))
            del board_states_training[sample]

        board_states_training = list(board_states_training.items())

        test_error = session.run(error,
                                 feed_dict={
                                     value_input_layer:
                                     [x[0] for x in board_states_test],
                                     target_placeholder:
                                     [[x[1]] for x in board_states_test]
                                 })

        while True:
            np.random.shuffle(board_states_training)
            train_error = 0

            for start_index in range(
                    0,
                    len(board_states_training) - batch_size + 1, batch_size):
                mini_batch = board_states_training[start_index:start_index +
                                                   batch_size]

                batch_error, _ = session.run(
                    [error, train_step],
                    feed_dict={
                        value_input_layer: [x[0] for x in mini_batch],
                        target_placeholder: [[x[1]] for x in mini_batch]
                    })
                train_error += batch_error

            new_test_error = session.run(error,
                                         feed_dict={
                                             value_input_layer:
                                             [x[0] for x in board_states_test],
                                             target_placeholder:
                                             [[x[1]]
                                              for x in board_states_test]
                                         })

            print("episode: %s train_error: %s test_error: %s" %
                  (episode_number, train_error, test_error))

            if new_test_error > test_error:
                print("train error went up, stopping training")
                break

            test_error = new_test_error
            episode_number += 1

        save_network(session, value_variables, value_network_file_path)
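
_generate_random_board_position is referenced above but not defined on this page. A hypothetical sketch of such a helper is given below; it assumes the game_spec exposes new_board, available_moves, apply_move and has_winner methods, which does not follow from the examples above, and the real helper may differ:

import random


def _generate_random_board_position(game_spec, number_moves_range):
    # play a random number of random legal moves from an empty board and
    # return the resulting position; retry if the game finishes early
    while True:
        board_state = game_spec.new_board()
        number_moves = random.randint(int(number_moves_range[0]),
                                      int(number_moves_range[1]))
        side = 1
        for _ in range(number_moves):
            move = random.choice(list(game_spec.available_moves(board_state)))
            board_state = game_spec.apply_move(board_state, move, side)
            if game_spec.has_winner(board_state) != 0:
                break  # the game ended before reaching the target length
            side = -side
        else:
            return board_state
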
def train_policy_gradients_vs_historic(
        game_spec,
        create_network,
        load_network_file_path,
        save_network_file_path=None,
        number_of_historic_networks=1,
        historic_network_base_path='historic_network',
        number_of_games=10000,
        update_opponent_winrate=0.65,
        print_results_every=100,
        learn_rate=1e-3,
        batch_size=100,
        cnn_on=False,
        eps=0.1,
        deterministic=True,
        mcts=False,
        min_win_ticks=3,
        beta=0.01):
    """Train a network against itself and over time store new version of itself to play against.

    Args:
        historic_network_base_path (str): Base path to save new historic networks to; a number for the network "slot"
            is appended to the end of this string.
        save_historic_every (int): We save a version of the learning network into one of the historic network
            "slots" every x number of games. We have number_of_historic_networks "slots"
        number_of_historic_networks (int): We keep this many old networks to play against
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the load_network_file_path param is used.
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        load_network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        update_opponent_winrate (float): the required winrate before updating the opponent to the newer agent
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        cnn_on (bool): use convolutional or regular neural network
        eps (float): fraction of moves made randomly
        deterministic (bool): use deterministic or stochastic move selection
        min_win_ticks (int): number of times the networks winrate needs to exceed update_opponent_winrate to update

    Returns:
        [tf.Variable]: trained variables used in the final network
    """
    save_network_file_path = save_network_file_path or load_network_file_path
    # create folder if it does not exist
    if save_network_file_path:
        split = save_network_file_path.split('/')
        directory = '/'.join(split[:-1]) or '.'
        if not os.path.isdir(directory):
            os.makedirs(directory)
            print("created directory " + directory)

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables, weights = create_network()

    baseline = np.zeros([100, 1])
    baselineCounter = 0
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      axis=1)) * (reward_placeholder - np.mean(baseline))
    #policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer) #Original one from historic
    #train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient) # Why is this one different from the other train policy grad?

    regularizer = sum([tf.nn.l2_loss(i) for i in weights])
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient +
                                                             beta *
                                                             regularizer)

    current_historical_index = 0  # We will (probably) not use this: we always train against the most recent agent
    historical_networks = []

    mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
    results = collections.deque(maxlen=print_results_every)

    for _ in range(number_of_historic_networks):
        historical_input_layer, historical_output_layer, historical_variables, _ = create_network(
        )
        historical_networks.append(
            (historical_input_layer, historical_output_layer,
             historical_variables))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        #testvar = variables[0] #\\ test for checking variables
        #print(session.run(variables[0]))
        base_episode_number = 0
        winrates = []

        if load_network_file_path and os.path.isfile(load_network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, load_network_file_path)
            base_episode_number, winrates = load_results(
                load_network_file_path)
        else:
            print('Creating new network')

        def make_move_historical(historical_network_index, board_state, side):
            net = historical_networks[historical_network_index]
            #move = get_stochastic_network_move(session, net[0], net[1], board_state, side,
            #                                  valid_only=True, game_spec=game_spec, CNN_ON=cnn_on)
            if mcts:
                _, move = monte_carlo_tree_search(game_spec, board_state, side,
                                                  27, session, input_layer,
                                                  output_layer, True, cnn_on,
                                                  True)
            else:
                move = get_deterministic_network_move(session,
                                                      net[0],
                                                      net[1],
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)

            move_for_game = np.asarray(
                move)  # move must be a numpy array; mcts does not return one
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        def make_training_move(board_state, side):
            if cnn_on:
                np_board_state = create_3x3_board_states(board_state)
            else:
                np_board_state = np.array(board_state)

            mini_batch_board_states.append(np_board_state * side)

            rand_numb = random.uniform(0., 1.)
            if rand_numb < eps:
                move = get_random_network_move(board_state, game_spec)
            elif deterministic:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)
            else:
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    move = get_stochastic_network_move(session,
                                                       input_layer,
                                                       output_layer,
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)

            move_for_game = np.asarray(
                move
            )  # The move returned to the game is in a different configuration than the CNN learn move
            if cnn_on:
                # Since the mini batch states is saved the same way it should enter the neural net (the adapted board state),
                # the same should happen for the mini batch moves
                move = create_3x3_board_states(np.reshape(
                    move, [9, 9]))  # The function requires a 9x9 array
                mini_batch_moves.append(move[0:81])
            else:
                mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        #for i in range(number_of_historic_networks):
        if os.path.isfile(historic_network_base_path + str(0) + '.p'):
            load_network(session, historical_networks[0][2],
                         historic_network_base_path + str(0) + '.p')
            print('Historic network loaded')
        else:
            # if we can't load a historical file use the current network weights
            print(
                'Warning: loading historical file failed. Current net is saved and being used as historic net.'
            )
            historic_filename = historic_network_base_path + str(
                current_historical_index) + '.p'
            save_network(session, variables, historic_filename)
            load_network(session,
                         historical_networks[current_historical_index][2],
                         historic_filename)

        win_ticks = 0  # counts the number of times the agent's winrate has been high enough to update its opponent
        for episode_number in range(1, number_of_games):
            opponent_index = random.randint(0, number_of_historic_networks - 1)
            make_move_historical_for_index = functools.partial(
                make_move_historical, opponent_index)

            # randomize if going first or second
            if bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move,
                                             make_move_historical_for_index)
            else:
                reward = -game_spec.play_game(make_move_historical_for_index,
                                              make_training_move)

            results.append(reward)
            baseline[baselineCounter] = reward
            baselineCounter += 1
            baselineCounter = baselineCounter % 100

            # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)
            reward /= float(last_game_length)
            mini_batch_rewards += ([reward] * last_game_length)
            episode_number += 1

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                winrate = _win_rate(print_results_every, results)
                if winrate == 0:
                    print('DEBUG TEST')
                winrates.append(
                    [base_episode_number + episode_number, winrate])
                print("episode: %s win_rate: %s" %
                      (base_episode_number + episode_number, winrate))
                if save_network_file_path:
                    save_network(
                        session, variables,
                        time.strftime(save_network_file_path[:-2] + "_ep" +
                                      str(base_episode_number +
                                          episode_number) +
                                      "_%Y-%m-%d_%H%M%S.p"))

            # Update opponent when winrate is high enough and it happens for a longer period
            if (episode_number % print_results_every
                    == 0) and (winrate >= update_opponent_winrate):
                win_ticks += 1
                if win_ticks >= min_win_ticks:
                    win_ticks = 0
                    first_bot = False
                    print("saving historical network %s at episode %s." %
                          (current_historical_index,
                           base_episode_number + episode_number)
                          )  # Overwrite historic opponent with current network
                    historic_filename = historic_network_base_path + str(
                        current_historical_index) + '.p'
                    save_network(session, variables, historic_filename)
                    load_network(
                        session,
                        historical_networks[current_historical_index][2],
                        historic_filename)

                    # also save to the main network file
                    save_network(session, variables,
                                 (save_network_file_path
                                  or load_network_file_path)[:-2] + "_ep" +
                                 str(base_episode_number + episode_number) +
                                 ".p")

                    current_historical_index += 1  # Not used when we only have 1 historic network
                    current_historical_index %= number_of_historic_networks

        # save our final weights
        save_network(session, variables, save_network_file_path
                     or load_network_file_path)

    return variables, _win_rate(print_results_every, results), winrates
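
_win_rate and load_results are used above but not shown on this page. A hypothetical sketch of _win_rate, assuming results is a deque of +1/0/-1 game rewards and that the rate is taken over the reporting window (the real helper may be implemented differently):

def _win_rate(window_size, results):
    # fraction of the games in the current reporting window that were won
    return float(sum(1 for reward in results if reward > 0)) / window_size
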
Example #7
            mini_batch = board_states_training[start_index:start_index +
                                               BATCH_SIZE]

            batch_error, _ = session.run(
                [error, train_step],
                feed_dict={
                    value_input_layer: [x[0] for x in mini_batch],
                    target_placeholder: [[x[1]] for x in mini_batch]
                })
            train_error += batch_error

        new_test_error = session.run(error,
                                     feed_dict={
                                         value_input_layer:
                                         [x[0] for x in board_states_test],
                                         target_placeholder:
                                         [[x[1]] for x in board_states_test]
                                     })

        print("episode: %s train_error: %s test_error: %s" %
              (episode_number, train_error, test_error))

        if new_test_error > test_error:
            print("train error went up, stopping training")
            break

        test_error = new_test_error
        episode_number += 1

    save_network(session, value_variables, VALUE_NETWORK_PATH)
def train_policy_gradients(game_spec,
                           create_network,
                           load_network_file_path,
                           save_network_file_path=None,
                           opponent_func=None,
                           number_of_games=10000,
                           print_results_every=1000,
                           learn_rate=1e-4,
                           batch_size=100,
                           randomize_first_player=True,
                           cnn_on=False,
                           eps=0.1,
                           deterministic=True,
                           mcts=False,
                           beta=0.01):
    """Train a network using policy gradients

    Args:
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the network_file_path param is used.
        opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an opponent playing
            randomly
        randomize_first_player (bool): If True we alternate between being the first and second player
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        load_network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):
        cnn_on: if True, then the convolutional neural network is used

    Returns:
        (variables used in the final network : list, win rate: float)
    """

    save_network_file_path = save_network_file_path or load_network_file_path
    # create folder if it does not exist
    if save_network_file_path:
        split = save_network_file_path.split('/')
        directory = '/'.join(split[:-1]) or '.'
        if not os.path.isdir(directory):
            os.makedirs(directory)
            print("created directory " + directory)

    if mcts:
        opponent_func = game_spec.get_monte_carlo_player_func(
            number_of_samples=27)
    else:
        opponent_func = opponent_func or game_spec.get_random_player_func()

    reward_placeholder = tf.placeholder("float", shape=(None, ))
    actual_move_placeholder = tf.placeholder("float",
                                             shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables, weights = create_network()
    baseline = np.zeros([100, 1])
    baselineCounter = 0
    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer),
                      axis=1)) * (reward_placeholder - np.mean(baseline))

    #regularizer = sum([tf.nn.l2_loss(i) for i in weights])
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(
        -policy_gradient)  # + beta * regularizer)
    #train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        # load existing network and keep track of number of games played
        base_episode_number = 0
        winrates = []
        if load_network_file_path and os.path.isfile(load_network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, load_network_file_path)
            base_episode_number, winrates = load_results(
                load_network_file_path)

        mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
        results = collections.deque(maxlen=print_results_every)

        def make_training_move(board_state, side):
            if cnn_on:
                # We must have the first 3x3 board as first 9 entries of the list, second 3x3 board as next 9 entries etc.
                # This is required for the CNN. The CNN takes the first 9 entries and forms a 3x3 board etc.
                """If the 10 split 3x3 boards are desired, use create_3x3_board_states(board_state) here"""
                np_board_state = create_3x3_board_states(board_state)
            else:
                np_board_state = np.array(board_state)
            np_board_state[np_board_state > 1] = 0
            mini_batch_board_states.append(
                np_board_state * side
            )  # append every state used in the mini batch (the sign indicates which player's state it was)

            rand_numb = random.uniform(0., 1.)
            if rand_numb < eps:
                move = get_random_network_move(board_state, game_spec)
            elif deterministic:
                move = get_deterministic_network_move(session,
                                                      input_layer,
                                                      output_layer,
                                                      board_state,
                                                      side,
                                                      valid_only=True,
                                                      game_spec=game_spec,
                                                      cnn_on=cnn_on)
            else:
                if mcts:
                    _, move = monte_carlo_tree_search(game_spec, board_state,
                                                      side, 27, session,
                                                      input_layer,
                                                      output_layer, True,
                                                      cnn_on, True)
                else:
                    move = get_stochastic_network_move(session,
                                                       input_layer,
                                                       output_layer,
                                                       board_state,
                                                       side,
                                                       valid_only=True,
                                                       game_spec=game_spec,
                                                       cnn_on=cnn_on)
            move_for_game = np.asarray(
                move
            )  # The move returned to the game is in a different configuration than the CNN learn move
            if cnn_on:
                # Since the mini batch states is saved the same way it should enter the neural net (the adapted board state),
                # the same should happen for the mini batch moves
                move = create_3x3_board_states(np.reshape(
                    move, [9, 9]))  # The function requires a 9x9 array
                mini_batch_moves.append(move[0:81])
            else:
                mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move_for_game.argmax())

        for episode_number in range(1, number_of_games + 1):
            # randomize if going first or second
            if (not randomize_first_player) or bool(random.getrandbits(1)):
                reward = game_spec.play_game(
                    make_training_move,
                    opponent_func)  # In this line one game is played.
            else:
                reward = -game_spec.play_game(opponent_func,
                                              make_training_move)

            results.append(reward)
            baseline[baselineCounter] = reward
            baselineCounter += 1
            baselineCounter = baselineCounter % 100
            # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(
                mini_batch_rewards)

            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(
                    mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step,
                            feed_dict={
                                input_layer: np_mini_batch_board_states,
                                reward_placeholder: normalized_rewards,
                                actual_move_placeholder: mini_batch_moves
                            })

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                winrate = _win_rate(print_results_every, results)
                winrates.append(
                    [base_episode_number + episode_number, winrate])
                print("episode: %s win_rate: %s" %
                      (base_episode_number + episode_number, winrate))
                if save_network_file_path:
                    save_network(
                        session, variables,
                        time.strftime(save_network_file_path[:-2] + "_ep" +
                                      str(base_episode_number +
                                          episode_number) +
                                      "_%Y-%m-%d_%H%M%S.p"))

        if save_network_file_path:
            save_network(session, variables, save_network_file_path)

    return variables, _win_rate(print_results_every, results), winrates
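
The objective built above is a REINFORCE-style loss: minus the log-probability of the chosen move, weighted by the reward minus a baseline, summed over the batch. (Note that np.mean(baseline) is evaluated once at graph-construction time, while the sketch below takes the baseline as a runtime argument.) A standalone numeric sketch with illustrative names only:

import numpy as np


def policy_gradient_loss(move_probs, chosen_moves_one_hot, rewards, baseline):
    # probability assigned to the move that was actually played in each state
    chosen_probs = np.sum(move_probs * chosen_moves_one_hot, axis=1)
    # REINFORCE loss: -log pi(a|s) * advantage, summed over the batch
    return -np.sum(np.log(chosen_probs) * (rewards - baseline))


probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])
moves = np.array([[1, 0, 0],
                  [0, 1, 0]])
print(policy_gradient_loss(probs, moves, np.array([1.0, -1.0]), 0.0))
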
Example #9
def train_policy_gradients(game_spec,
                           create_network,
                           network_file_path,
                           save_network_file_path=None,
                           opponent_func=None,
                           number_of_games=10000,
                           print_results_every=1000,
                           learn_rate=1e-4,
                           batch_size=100,
                           randomize_first_player=True):
    """Train a network using policy gradients

    Args:
        save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then
            the network_file_path param is used.
        opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an opponent playing
            randomly
        randomize_first_player (bool): If True we alternate between being the first and second player
        game_spec (games.base_game_spec.BaseGameSpec): The game we are playing
        create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])):
            Method that creates the network we will train.
        network_file_path (str): path to the file with weights we want to load for this network
        number_of_games (int): number of games to play before stopping
        print_results_every (int): Prints results to std out every x games, also saves the network
        learn_rate (float):
        batch_size (int):

    Returns:
        (variables used in the final network : list, win rate: float)
    """
    save_network_file_path = save_network_file_path or network_file_path
    opponent_func = opponent_func or game_spec.get_random_player_func()
    reward_placeholder = tf.placeholder("float", shape=(None,))
    actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.outputs()))

    input_layer, output_layer, variables = create_network()

    policy_gradient = tf.log(
        tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer), reduction_indices=1)) * reward_placeholder
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient)

    with tf.Session() as session:
        session.run(tf.initialize_all_variables())

        if network_file_path and os.path.isfile(network_file_path):
            print("loading pre-existing network")
            load_network(session, variables, network_file_path)

        mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], []
        results = collections.deque(maxlen=print_results_every)

        def make_training_move(board_state, side):
            mini_batch_board_states.append(np.ravel(board_state) * side)
            move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side)
            mini_batch_moves.append(move)
            return game_spec.flat_move_to_tuple(move.argmax())

        for episode_number in range(1, number_of_games):
            # randomize if going first or second
            if (not randomize_first_player) or bool(random.getrandbits(1)):
                reward = game_spec.play_game(make_training_move, opponent_func)
            else:
                reward = -game_spec.play_game(opponent_func, make_training_move)

            results.append(reward)

            # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
            last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards)

            reward /= float(last_game_length)

            mini_batch_rewards += ([reward] * last_game_length)

            if episode_number % batch_size == 0:
                normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards)

                rewards_std = np.std(normalized_rewards)
                if rewards_std != 0:
                    normalized_rewards /= rewards_std
                else:
                    print("warning: got mini batch std of 0.")

                np_mini_batch_board_states = np.array(mini_batch_board_states) \
                    .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:])

                session.run(train_step, feed_dict={input_layer: np_mini_batch_board_states,
                                                   reward_placeholder: normalized_rewards,
                                                   actual_move_placeholder: mini_batch_moves})

                # clear batches
                del mini_batch_board_states[:]
                del mini_batch_moves[:]
                del mini_batch_rewards[:]

            if episode_number % print_results_every == 0:
                print("episode: %s win_rate: %s" % (episode_number, _win_rate(print_results_every, results)))
                if network_file_path:
                    save_network(session, variables, save_network_file_path)

        if network_file_path:
            save_network(session, variables, save_network_file_path)

    return variables, _win_rate(print_results_every, results)
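
A hypothetical call site for this simpler train_policy_gradients, assuming a project layout like the one implied by the other examples (a TicTacToeGameSpec and a create_network(input_nodes, hidden_nodes) helper returning the (input_layer, output_layer, variables) tuple); the module paths and names here are assumptions:

import functools

from games.tic_tac_toe import TicTacToeGameSpec
from common.network_helpers import create_network

game_spec = TicTacToeGameSpec()
create_model = functools.partial(create_network,
                                 game_spec.board_squares(),
                                 (100, 100, 100))

variables, win_rate = train_policy_gradients(game_spec,
                                             create_model,
                                             'current_network.p',
                                             number_of_games=100000,
                                             print_results_every=1000)
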
Example #10
import collections