Example 1
def train(board_size, max_timesteps):
    """train gomoku AI play board whose size is board_size x board_size.

    Parameters
    ----------
    board_size: int
        Size of the board in one dimension; for example,
        board_size = 9 --> the board has size 9x9.
    max_timesteps: int
        Number of training steps.

    Returns
    -------
    None
    """
    env = gym.make(
        'Gomoku{}x{}-arena-v0'.format(board_size, board_size))
    val_env = gym.make(
        'Gomoku{}x{}-arena-v0'.format(board_size, board_size), __val_opponent_policy)

    # Enabling layer_norm here is important for parameter space noise!
    capacity = 64
    num_conv_layers = 8
    # Each conv spec is (num_filters, kernel_size, stride), as expected by
    # deepq.models.cnn_to_mlp.
    conv_layers = [(capacity, 3, 1)] * num_conv_layers
    hidden_layers = [capacity]

    model = deepq.models.cnn_to_mlp(
        convs=conv_layers,
        hiddens=hidden_layers,
    )

    timesteps_to_explore = 800000

    act = deepq.learn(
        env=env,
        val_env=val_env,
        q_func=model,
        max_timesteps=max_timesteps,
        lr=1e-4,
        buffer_size=400000,
        batch_size=512,
        exploration_fraction=(timesteps_to_explore / max_timesteps),
        exploration_final_eps=0.35,
        train_freq=4,
        val_freq=1000,
        print_freq=100,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=False,
        deterministic_filter=True,
        random_filter=True,
        state_file='kaithy_cnn_to_mlp_{}_model.pkl'.format(board_size),
    )

    print('Saving model to kaithy_cnn_to_mlp_{}_model.pkl'.format(
        board_size))
    act.save('kaithy_cnn_to_mlp_{}_model.pkl'.format(board_size))
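
A minimal way to drive train() might look like the sketch below. The board size and timestep budget are illustrative assumptions, and the snippet's own module-level imports (gym, the gym_gomoku registration, and the project's patched deepq with val_env / deterministic_filter support) are presumed to be in place.

# Hypothetical entry point for the train() example above; the values are
# illustrative, not from the original snippet.
if __name__ == '__main__':
    train(board_size=9, max_timesteps=2000000)

With the assumed 2,000,000 steps, exploration_fraction evaluates to 800000 / 2000000 = 0.4, i.e. epsilon is annealed over the first 40% of training before settling at exploration_final_eps.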
Example 2
def enjoy(board_size):
    """enjoy trained gomoku AI play board whose size is board_size x board_size.

    Parameters
    ----------
    board_size: int
        Size of the board in one dimension; for example,
        board_size = 9 --> the board has size 9x9.

    Returns
    -------
    None
    """
    env = gym.make('Gomoku{}x{}-arena-v0'.format(board_size, board_size),
                   __val_opponent_policy)
    act = deepq.load("kaithy_cnn_to_mlp_{}_model.pkl".format(board_size))
    # Enabling layer_norm here is important for parameter space noise!

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            obs, rew, done, _ = env.step(act(obs[None], stochastic=False)[0])
            episode_rew += rew
            env.render()
        print('Episode reward', episode_rew)
        input('Hit enter to play next match')
        print('Swap color')
        env.swap_role()
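
Both train() and enjoy() pass __val_opponent_policy to gym.make, but it is never defined in these snippets. A plausible stand-in, modeled on opponent_policy from Example 8 (an assumption, not the original definition), simply wraps gym_gomoku's built-in beginner policy:

import numpy as np
import gym_gomoku


def __val_opponent_policy(curr_state, prev_state, prev_action):
    # Hypothetical validation opponent: delegate to the built-in beginner
    # policy, exactly as opponent_policy does in Example 8.
    return gym_gomoku.envs.util.make_beginner_policy(np.random)(
        curr_state, prev_state, prev_action)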
Example 3
def main():
    '''
    AI Self-training program
    '''
    deterministic_actions_filter = True

    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)

    obs_ph = tf.placeholder(
        dtype=tf.float32, shape=[None] + list(env.observation_space.shape))

    if deterministic_actions_filter:
        # Build a per-position mask by summing over the channel planes
        # (compare the later examples, which sum only the stone planes
        # obs_ph[:, :, :, 1:3]).
        invalid_masks = tf.reduce_sum(obs_ph, axis=3)

    sess = tf.Session()

    observations = []

    for i in range(2):
        observation = env.reset()
        done = False

        while not done:
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            env.render()
    out = sess.run(invalid_masks, feed_dict={
        obs_ph: observations})
    print(out)
    print(out.shape)
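
The mask above is just a per-position sum over observation planes. A small NumPy sketch of the same idea (the 5x5x3 plane layout is an assumption for illustration; the later examples sum only the two stone planes, channels 1 and 2):

import numpy as np

# Hypothetical 5x5x3 observation: plane 1 = black stones, plane 2 = white
# stones (layout assumed for illustration).
obs = np.zeros((5, 5, 3), dtype=np.float32)
obs[2, 2, 1] = 1.0   # a black stone at (2, 2)
obs[1, 3, 2] = 1.0   # a white stone at (1, 3)

# Summing the stone planes gives 1 where a move is already taken (invalid).
invalid_mask = obs[:, :, 1:3].sum(axis=2)
print(invalid_mask[2, 2], invalid_mask[1, 3], invalid_mask[0, 0])  # 1.0 1.0 0.0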
Example 4

def main():
    '''
    AI Self-training program
    '''
    class Opponent(object):
        def __init__(self):
            self.__old_obs = None
            self.__old_action = None
            self.__obs = None

        def policy(self, curr_state, prev_state, prev_action):
            '''
            Define policy for opponent here
            '''
            return gym_gomoku.envs.util.make_beginner_policy(np.random)(
                curr_state, prev_state, prev_action)

    opponent = Opponent()
    env = gym.make('Gomoku5x5-training-camp-v0')
    env.opponent_policy = opponent.policy

    for i in range(2):
        observation = env.reset()
        done = False

        while not done:
            action = env.action_space.sample()  # sample without replacement
            observation, reward, done, info = env.step(action)
            env.render()

        env.swap_role()
        print("\n----SWAP----\n")
Example 5
def main():
    '''
    AI Self-training program
    '''
    env = gym.make('Gomoku9x9-training-camp-v0', opponent_policy)
    env.reset()

    action = env.action_space.sample()  # sample without replacement
    observation, reward, done, info = env.step(action)
Example 6
def main():
    env = gym.make('Gomoku9x9-training-camp-v0', opponent_policy)
    model = models.mlp([64])
    act = simple.learn(env,
                       q_func=model,
                       lr=1e-3,
                       max_timesteps=100000,
                       buffer_size=50000,
                       exploration_fraction=0.1,
                       exploration_final_eps=0.02,
                       print_freq=10,
                       callback=callback)
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example 7

def main():
    '''
    AI Self-training program
    '''
    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)

    for i in range(2):
        observation = env.reset()
        done = False

        while not done:
            action = env.action_space.sample()  # sample without replacement
            observation, reward, done, info = env.step(action)
            env.render()

        env.swap_role()
        print("\n----SWAP----\n")
Example 8
import matplotlib.pyplot as plt
import scipy.misc
import os
#%matplotlib inline


def opponent_policy(curr_state, prev_state, prev_action):
    '''
    Define policy for opponent here
    '''
    return gym_gomoku.envs.util.make_beginner_policy(np.random)(curr_state,
                                                                prev_state,
                                                                prev_action)


env = gym.make('Gomoku9x9-training-camp-v0', opponent_policy)
env.reset()


class Qnetwork():
    def __init__(self, h_size):
        # The network receives a frame from the game, flattened into an array.
        # It then resizes it and processes it through four convolutional layers.
        self.scalarInput = tf.placeholder(shape=[None, 162], dtype=tf.float32)
        # 9 x 9 board with 2 planes -> 162 values per flattened frame
        self.imageIn = tf.reshape(self.scalarInput, shape=[-1, 9, 9, 2])
        self.conv1 = slim.conv2d(inputs=self.imageIn,
                                 num_outputs=3,
                                 kernel_size=[2, 2],
                                 stride=[1, 1],
                                 padding='VALID',
                                 biases_initializer=None)
Example 9

def main():
    '''
    AI Self-training program
    '''
    deterministic_filter = True
    random_filter = True

    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)

    num_actions = env.action_space.n

    obs_ph = tf.placeholder(
        dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
    q_values = layers.fully_connected(layers.flatten(obs_ph), num_actions)

    if deterministic_filter or random_filter:
        # Sum the stone planes (channels 1 and 2) and flatten: entries equal
        # to 1 mark occupied, hence invalid, positions.
        invalid_masks = tf.contrib.layers.flatten(
            tf.reduce_sum(obs_ph[:, :, :, 1:3], axis=3))

    if deterministic_filter:
        # Push the Q-value of every invalid action below the current minimum
        # so that argmax can never pick an occupied position.
        q_values_worst = tf.reduce_min(q_values, axis=1, keep_dims=True)
        # q_values = tf.where(tf.equal(
        #     invalid_masks, 1.), q_values_worst - 1.0, q_values)
        q_values = invalid_masks * (q_values_worst - 1.0) + \
            (1.0 - invalid_masks) * q_values

    deterministic_actions = tf.argmax(q_values, axis=1, output_type=tf.int32)
    batch_size = tf.shape(obs_ph)[0]
    stochastic_ph = tf.constant(True, dtype=tf.bool)
    random_actions = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int32)

    if random_filter:
        def get_elements(data, indices):
            # Gather data[i, indices[i]] for each row i of a 2-D tensor.
            flat_indices = tf.range(0, tf.shape(indices)[
                0]) * data.shape[1] + indices
            return tf.gather(tf.reshape(data, [-1]), flat_indices)
        is_invalid_random_actions = get_elements(
            invalid_masks, random_actions)
        random_actions = tf.where(tf.equal(
            is_invalid_random_actions, 1.), deterministic_actions, random_actions)

    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < 0.9
    stochastic_actions = tf.where(
        chose_random, random_actions, deterministic_actions)

    output_actions = tf.where(
        stochastic_ph, stochastic_actions, deterministic_actions)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(q_values)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    observations = []

    for i in range(2):
        observation = env.reset()
        done = False

        while not done:
            action = sess.run(output_actions, feed_dict={
                obs_ph: observation[None]})[0]
            observation, reward, done, info = env.step(action)
            env.render()
            observations.append(observation)

        print(reward)
        env.swap_role()
        print("\n----SWAP----\n")

    actions = sess.run(output_actions, feed_dict={
        obs_ph: observations})
    sess.run(q_values, feed_dict={
        obs_ph: observations})
    print(actions)
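
The action-filtering logic above has a straightforward NumPy analogue. The sketch below (sizes and the occupied positions are assumptions) mirrors the same arithmetic: invalid actions get a Q-value strictly below the current worst, and a random action that lands on an occupied cell falls back to the deterministic one.

import numpy as np

rng = np.random.default_rng(0)

num_actions = 25                        # 5x5 board, flattened
q_values = rng.normal(size=(1, num_actions)).astype(np.float32)
invalid_mask = np.zeros((1, num_actions), dtype=np.float32)
invalid_mask[0, [6, 12]] = 1.0          # two occupied positions (assumed)

# Deterministic filter: push invalid actions below the current minimum.
q_worst = q_values.min(axis=1, keepdims=True)
filtered_q = invalid_mask * (q_worst - 1.0) + (1.0 - invalid_mask) * q_values
deterministic_action = int(filtered_q.argmax(axis=1)[0])

# Random filter: fall back to the deterministic action if the sampled
# random action is invalid.
random_action = int(rng.integers(num_actions))
if invalid_mask[0, random_action] == 1.0:
    random_action = deterministic_action

assert invalid_mask[0, deterministic_action] == 0.0
print(deterministic_action, random_action)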
Example 10
def main():
    '''
    AI Self-training program
    '''
    deterministic_filter = True
    random_filter = True
    env = gym.make('Gomoku5x5-training-camp-v0', opponent_policy)

    num_actions = env.action_space.n

    # obs_ph = tf.placeholder(
    #     dtype=tf.float32, shape=[None] + list(env.observation_space.shape))
    # q_values = layers.fully_connected(layers.flatten(obs_ph), num_actions)
    # NOTE: make_obs_ph is not used below; flatten_obs and U are not defined
    # in this snippet.
    def make_obs_ph(name):
        obs_shape = env.observation_space.shape

        if flatten_obs:
            flattened_env_shape = 1
            for dim_size in env.observation_space.shape:
                flattened_env_shape *= dim_size
            obs_shape = (flattened_env_shape, )

        return U.BatchInput(obs_shape, name=name)

    #  Create batch augmentation for obs ------------------------------------------

    obs_t_input = tf.placeholder(dtype=tf.float32,
                                 shape=list(env.observation_space.shape))

    list_obs = []
    list_obs.append(obs_t_input)
    for i in range(0, 8):
        if (i > 0 and i < 4):
            # rotations by 90, 180 and 270 degrees
            list_obs.append(tf.image.rot90(obs_t_input, k=i))
        if (i == 4):
            # left-right flip
            list_obs.append(tf.image.flip_left_right(obs_t_input))
        if (i > 4 and i < 8):
            # flip followed by rotations, matching flip_action below
            list_obs.append(tf.image.rot90(
                tf.image.flip_left_right(obs_t_input), k=(i - 4)))

    obs_ph = tf.stack(list_obs)

    # end create augmentation----------------------------------------

    q_values = layers.fully_connected(layers.flatten(obs_ph), num_actions)
    if deterministic_filter or random_filter:
        invalid_masks = tf.contrib.layers.flatten(
            tf.reduce_sum(obs_ph[:, :, :, 1:3], axis=3))
        # print(tf.shape(invalid_masks))
        # exit(0)

    if deterministic_filter:
        q_values_worst = tf.reduce_min(q_values, axis=1, keep_dims=True)
        # q_values = tf.where(tf.equal(
        #     invalid_masks, 1.), q_values_worst - 1.0, q_values)
        q_values = invalid_masks * (q_values_worst - 1.0) + \
            (1.0 - invalid_masks) * q_values

    deterministic_actions = tf.argmax(q_values, axis=1, output_type=tf.int32)
    batch_size = tf.shape(obs_ph)[0]
    stochastic_ph = tf.constant(True, dtype=tf.bool)
    random_actions = tf.random_uniform(tf.stack([batch_size]),
                                       minval=0,
                                       maxval=num_actions,
                                       dtype=tf.int32)

    if random_filter:

        def get_elements(data, indices):
            flat_indices = tf.range(
                0, tf.shape(indices)[0]) * data.shape[1] + indices
            return tf.gather(tf.reshape(data, [-1]), flat_indices)

        is_invalid_random_actions = get_elements(invalid_masks, random_actions)
        random_actions = tf.where(tf.equal(is_invalid_random_actions, 1.),
                                  deterministic_actions, random_actions)

    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < 0.9
    stochastic_actions = tf.where(chose_random, random_actions,
                                  deterministic_actions)

    output_actions = tf.where(stochastic_ph, stochastic_actions,
                              deterministic_actions)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(q_values)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    observations = []

    for i in range(2):
        observation = env.reset()
        done = False
        while not done:

            #  create action rotate -----------------------------

            def rotate_action(board_size, pos_1D, k):
                """
                Map a flat board position to its position after rotating the
                board by k * 90 degrees (counter-clockwise, matching
                tf.image.rot90 / np.rot90).
                    :param board_size: size of the board in one dimension
                    :param pos_1D: flattened position on the board
                    :param k:   1: rotate 90
                                2: rotate 180
                                3: rotate 270
                """
                pos_2D = (pos_1D // board_size, pos_1D % board_size)
                # rot90
                if (k == 1):
                    rot_pos = pos_2D[0] + (board_size - 1 -
                                           pos_2D[1]) * board_size
                # rot180
                if (k == 2):
                    rot_pos = (board_size - 1 - pos_2D[0]) * board_size + (
                        board_size - 1 - pos_2D[1])
                # rot270
                if (k == 3):
                    rot_pos = (board_size - 1 -
                               pos_2D[0]) + pos_2D[1] * board_size
                return rot_pos

            def flip_action(board_size, pos_1D, k):
                """
                Map a flat board position to its position after flipping the
                board left-right and then rotating it by k * 90 degrees.
                    :param board_size: size of the board in one dimension
                    :param pos_1D: flattened position on the board
                    :param k:   0: flip only
                                1: flip and rotate 90
                                2: flip and rotate 180
                                3: flip and rotate 270
                """
                pos_2D = (pos_1D // board_size, pos_1D % board_size)
                # flip and rot 0
                if (k == 0):
                    flip_rot = pos_2D[
                        0] * board_size + -pos_2D[1] + board_size - 1
                # flip and rot 90
                if (k == 1):
                    flip_rot = pos_2D[1] * board_size + pos_2D[0]
                # flip and rot 180
                if (k == 2):
                    flip_rot = (-pos_2D[0] + board_size -
                                1) * board_size + pos_2D[1]
                # flip and rot 270
                if (k == 3):
                    flip_rot = (-pos_2D[1] + board_size -
                                1) * board_size + -pos_2D[0] + board_size - 1
                return flip_rot

            # run to get action from AI
            actions = sess.run(output_actions,
                               feed_dict={obs_t_input: observation})

            # Action chosen for the original (un-augmented) observation
            action = actions[0]

            # Express the same move in the coordinate frame of each of the
            # other 7 augmented boards
            for i in range(1, 8):
                if (i < 4):
                    actions[i] = rotate_action(observation.shape[0], action, i)
                else:
                    actions[i] = flip_action(observation.shape[0], action,
                                             (i - 4))

            # END create actions --------------------------------

            observation, reward, done, info = env.step(action)

            angle = 1
            flip_action(observation.shape[0], action, angle)
            print(action, rotate_action(observation.shape[0], action, angle))
            # exit(0)
            #  observation flip and rotate
            print(observation[:, :, 1], env.observation_space.shape[0:2])
            # exit(0)
            obs_temp_ph = tf.placeholder(dtype=tf.int32,
                                         shape=(env.observation_space.shape))
            k = tf.placeholder(tf.int32)
            # tf_img = tf.image.rot90(obs_temp_ph, k = k)
            # tf_img1 = tf.image.flip_left_right(obs_temp_ph)
            tf_img = tf.image.rot90(obs_temp_ph, k=k)
            # tf_img = tf.image.flip_left_right(obs_temp_ph)
            rotated_img = sess.run(tf_img,
                                   feed_dict={
                                       obs_temp_ph: observation,
                                       k: angle
                                   })
            # rotated_img = sess.run(tf_img1, feed_dict = {obs_temp_ph: observation})
            print(rotated_img[:, :, 1])
            exit(0)

            # end observation flip-and-rotate check
            env.render()
            observations.append(observation)

        print(reward)
        env.swap_role()
        print("\n----SWAP----\n")