Example #1
def playGame(sess,net):
    # open up a game state to communicate with emulator
    game_state = game.GameState()
    agent = Agent.Agent(sess)

    # get the first state by doing nothing and preprocess the image to 80x80x4
    x_t, r_0, terminal = game_state.frame_step([1, 0, 0])
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    aux_s = s_t

    score = 0
    while not terminal:

        # choose an action
        action = agent.choose_action_play(net, s_t)

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(action)
        score += r_t
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
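        # slide the 4-frame state window: drop the oldest frame (channel 0) and append the newest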
        aux_s = np.delete(s_t, 0, axis = 2)
        s_t1 = np.append(aux_s, x_t1, axis = 2)

        # update state and score
        s_t = s_t1


    # Print final score
    print "FINAL SCORE1", score
Example #2
    def __init__(self, game_name):
        if game_name == "pong":
            # open up a game state to communicate with emulator
            import pong_fun as game
            self.game_name = "pong"
            self.game_state = game.GameState()
            self.action_number = 3

        if game_name == "gym":
            self.game_name = "gym"
            #self.game_state = gym.make('FlappyBird-v0')
            self.game_state = gym.make('Breakout-v0')
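            # gym environments expose their action count via action_space.n, unlike the hand-coded emulators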
            self.action_number = self.game_state.action_space.n

        if game_name == "bird_black":
            # open up a game state to communicate with emulator
            import wrapped_flappy_bird as game
            self.game_name = "bird_black"
            self.game_state = game.GameState()
            self.action_number = 2
Example #3
def playGame(sess):
    # open up a game state to communicate with emulator
    game_state = game.GameState()

    score = 0

    # get the first state by doing nothing and preprocess the image to 80x80x4
    x_t, r_0, terminal = game_state.frame_step([1, 0, 0])
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    aux_s = s_t

    t = 0
    while not terminal:

        # choose an action
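        # (greedy evaluation: always take the argmax-Q action, no epsilon exploration)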
        readout_t = O_readout.eval(session=sess, feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])

        action_index = np.argmax(readout_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        aux_s = np.delete(s_t, 0, axis=2)
        s_t1 = np.append(aux_s, x_t1, axis=2)

        # update state and score
        s_t = s_t1
        t += 1
        score += r_t

        print "TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t
        print readout_t

    # Print final score
    print "FINAL SCORE", score
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
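    # Q(s, a) of the chosen action: mask the Q-value vector with the one-hot action and sum over the action axis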
    readout_action = tf.reduce_sum(tf.multiply(readout, a),
                                   reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    #a_file = open("logs_" + GAME + "/readout.txt", 'w')
    #h_file = open("logs_" + GAME + "/hidden.txt", 'w')
    plt.show()
    axes = plt.gca()
    axes.set_xlim(0, 10000)
    axes.set_ylim(-0.5, 0.5)
    xdata, reworddata = [], []  # plot buffers (assumed; they may be module-level globals in the full source)
    line, = axes.plot(xdata, reworddata, 'r-')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    print('x_t min', x_t.min())
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    #saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    #checkpoint = tf.train.get_checkpoint_state("saved_networks")
    #if checkpoint and checkpoint.model_checkpoint_path:
    #   saver.restore(sess, checkpoint.model_checkpoint_path)
    #   print("Successfully loaded:", checkpoint.model_checkpoint_path)
    #else:
    #   print("Could not find old network weights")

    epsilon = INITIAL_EPSILON
    t = 0
    while "pigs" != "fly":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
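        # epsilon is annealed linearly from INITIAL_EPSILON to FINAL_EPSILON over EXPLORE steps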
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
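            # frame skipping: the same action a_t is repeated for K consecutive frames and each transition is stored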
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
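            # Q-learning targets: y = r for terminal transitions, otherwise y = r + GAMMA * max_a' Q(s', a')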
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))
            # print('maxreadout is', np.max(readout_j1_batch[i]),'y_batch is', (y_batch[i]),'r_batch is', (r_batch[i]))
            # print('y_batch is', (y_batch[i]))
            # print('y_batch is', (r_batch[i]))

            # perform gradient step
            train_stepe, coste, readout_actione, h_fc11 = sess.run(
                [train_step, cost, readout_action, h_fc1],
                feed_dict={
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch
                })
            #mprint('cost is',coste,'readout_actione is', readout_actione,'NetworkWeight',np.asarray(h_fc11[:,:,0,0]))
            #xdata.append(t)
            #reworddata.append(coste)
            #line.set_xdata(xdata)
            #line.set_ydata(reworddata)
            #plt.draw()
        # plt.pause(1e-200)

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        #if t % 10000 == 0:
        #   saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))

        # write info to files
        '''
Example #5
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices = 1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    #saver.restore(sess, "/tmp/model.ckpt")

    epsilon = INITIAL_EPSILON

    # observe state
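    # the replay memory is filled with purely random actions before any gradient updates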
    for t in range(int(OBSERVE)):
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = random.randrange(ACTIONS)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:,:,0:3], axis = 2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # update the old values
        s_t = s_t1

        # print info
        state = "observe"

        print ("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))

    t = 0

    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict = {s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0

        if random.random() <= epsilon:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:,:,0:3], axis = 2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # sample a minibatch to train on
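        # (uniform sampling from the replay memory de-correlates consecutive, highly similar frames)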
        minibatch = random.sample(D, BATCH)

        # get the batch variables
        s_j_batch = [d[0] for d in minibatch]
        a_batch = [d[1] for d in minibatch]
        r_batch = [d[2] for d in minibatch]
        s_j1_batch = [d[3] for d in minibatch]

        y_batch = []
        readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
        for i in range(0, len(minibatch)):
            # if terminal only equals reward
            if minibatch[i][4]:
                y_batch.append(r_batch[i])
            else:
                y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i])) #todo: ? not reward

        # perform gradient step
        train_step.run(feed_dict = {
            y : y_batch,
            a : a_batch,
            s : s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, "/save/model-" + str(t) + ".ckpt")

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        #print ("TIMESTEP", t, "/ STATE", state, "/ LINES", game_state.total_lines, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))
        print ("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))

        # write info to files
        '''
def trainNetwork(s, readout, h_fc1, sess):
    tick = time.time()
    a = tf.placeholder("float", [None, actions])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a),
                                   reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    game_state = game.GameState()
    # rolling window of the last four score differentials (used by the stopping criterion below)
    win_score = [0, 0, 0, 0]
    # store the previous observations in replay memory
    D = deque()
    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(actions)
    do_nothing[0] = 1
    x_t, r_0, terminal, bar1_score, bar2_score = game_state.frame_step(
        do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    observe = 500.
    explore = 500.
    FINAL_EPSILON = 0.05
    INITIAL_EPSILON = 1.0
    epsilon = INITIAL_EPSILON
    t = 0
    K = 1
    while True:
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([actions])
        action_index = 0
        if random.random() <= epsilon or t <= observe:
            action_index = random.randrange(actions)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        if epsilon > FINAL_EPSILON and t > observe:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / explore

        for i in range(0, K):
            x_t1_col, r_t, terminal, bar1_score, bar2_score = game_state.frame_step(
                a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > replay_memory:
                D.popleft()

        if t > observe:
            minibatch = random.sample(D, batch)

            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   gamma * np.max(readout_j1_batch[i]))

            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        s_t = s_t1
        t += 1
        if r_t != 0:
            print("Timestep", t, " Score", bar1_score)

        win_score.pop(0)
        win_score.append(bar1_score - bar2_score)
        if (np.matrix(win_score).sum() > 72):  #72
            print("Game_Ends_in Time:", int(time.time() - tick))
            break
def trainNetwork(s, readout_net1, readout_net2, readout_netb1, readout_netb2,
                 sess):
    # define the cost function
    [train_step_net1, y_net1, a_net1] = getTrainStep(readout_net1)
    [train_step_net2, y_net2, a_net2] = getTrainStep(readout_net2)

    [train_step_netb1, y_netb1, a_netb1] = getTrainStep(readout_netb1)
    [train_step_netb2, y_netb2, a_netb2] = getTrainStep(readout_netb2)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + args.game_log_name + "/readout.txt", 'w')
    h_file = open("logs_" + args.game_log_name + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(args.action_count)
    do_nothing[0] = 1
    x_t, r_0, r_1, terminal = game_state.frame_step(do_nothing, do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    if (args.figure):
        fig = plt.figure()
        plt.imshow(x_t.T)
        fig.savefig(args.figure_name)

    import time
    #time.sleep(5)
    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    print checkpoint.model_checkpoint_path
    #saver.restore(sess, checkpoint.model_checkpoint_path)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"

    epsilon = args.initial_epsilon
    t = 0

    net_flag = 0
    cnt = 0
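    # net_flag selects which network pair (net1/netb1 vs net2/netb2) chooses the actions;
    # it is toggled every args.switch_net steps once the observation phase is over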
    while True:
        # choose an action epsilon greedily
        if net_flag == 0:
            readout_t = readout_net1.eval(feed_dict={s: [s_t]})[0]
            readout_bt = readout_netb1.eval(feed_dict={s: [s_t]})[0]

        else:
            readout_t = readout_net2.eval(feed_dict={s: [s_t]})[0]
            readout_bt = readout_netb2.eval(feed_dict={s: [s_t]})[0]

        a_t = np.zeros([args.action_count])
        a_bt = np.zeros([args.action_count])

        action_index = 0
        if random.random() <= epsilon or t <= args.observation_count:
            action_index = random.randrange(args.action_count)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        action_indexb = 0
        if random.random() <= epsilon or t <= args.observation_count:
            action_indexb = random.randrange(args.action_count)
            a_bt[action_indexb] = 1
        else:
            action_indexb = np.argmax(readout_bt)
            a_bt[action_indexb] = 1

        # scale down epsilon
        if epsilon > args.final_epsilon and t > args.observation_count:
            epsilon -= (args.initial_epsilon -
                        args.final_epsilon) / args.explore_frames

        for i in range(0, args.k):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, r_bt, terminal = game_state.frame_step(a_t, a_bt)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(s_t[:, :, 1:], x_t1, axis=2)
            # store the transition in D
            '''
            if t==5:
                fig1 = plt.figure()
                plt.imshow(s_t[:,:,0].T)
                fig1.savefig('trrr_1.png')

                fig2 = plt.figure()
                plt.imshow(s_t[:,:,1].T)
                fig2.savefig('trrr_2.png')

                fig3 = plt.figure()
                plt.imshow(s_t[:,:,2].T)
                fig3.savefig('trrr_3.png')

                fig4 = plt.figure()
                plt.imshow(s_t[:,:,3].T)
                fig4.savefig('trrr_4.png')

                time.sleep(5)
            '''
            D.append((s_t, a_t, r_t, a_bt, r_bt, s_t1, terminal))
            if len(D) > args.replay_memory:
                D.popleft()

        # only train if done observing
        if t > args.observation_count:

            cnt = cnt + 1
            # sample a minibatch to train on
            '''
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]

            ab_batch = [d[3] for d in minibatch]
            rb_batch = [d[4] for d in minibatch]

            s_j1_batch = [d[5] for d in minibatch]

            y_batch = []
            yb_batch = []

            if net_flag == 0:
                readout_j1_batch = readout_net2.eval(feed_dict = {s : s_j1_batch})
                readoutb_j1_batch = readout_netb2.eval(feed_dict = {s : s_j1_batch})
            else:
                readout_j1_batch = readout_net1.eval(feed_dict = {s : s_j1_batch})
                readoutb_j1_batch = readout_netb1.eval(feed_dict = {s : s_j1_batch})

            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][6]:
                    y_batch.append(r_batch[i])
                    yb_batch.append(rb_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
                    yb_batch.append(rb_batch[i] + GAMMA * np.max(readoutb_j1_batch[i]))

            # perform gradient step
            if net_flag == 0:
                train_step_net2.run( feed_dict = {y_net2 : y_batch,a_net2 : a_batch, s : s_j_batch} )
                train_step_netb2.run( feed_dict = {y_netb2 : yb_batch,a_netb2 : ab_batch, s : s_j_batch} )
            else:
                train_step_net1.run( feed_dict = {y_net1 : y_batch,a_net1 : a_batch, s : s_j_batch} )
                train_step_netb1.run( feed_dict = {y_netb1 : yb_batch,a_netb1 : ab_batch, s : s_j_batch} )
            '''
            # update the old values
            if cnt % args.switch_net == 0:
                if net_flag == 0:
                    net_flag = 1
                else:
                    net_flag = 0
                #print 'SwitchState'
        s_t = s_t1
        t += 1

        # print info
        state = ""
        if t <= args.observation_count:
            state = "observe"
        elif t > args.observation_count and t <= args.observation_count + args.explore_frames:
            state = "explore"
        else:
            state = "train"
        #if r_t != 0:
        #    print "TIMESTEP", t, "/ STATE", state, "/ LINES", game_state.total_lines, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        if r_bt != 0:
            print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_indexb, "/ REWARD", r_bt, "/ Q_MAX %e" % np.max(
                readout_bt)

        if r_t != 0:
            print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(
                readout_t)
        # write info to files
        '''
        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
        '''
        if (t and t % args.terminate_prompt == 0):
            print "TIMESTEP = " + str(t)
            if (yes_or_no("Want to terminate the code")):
                return
            else:
                continue
def trainNetwork(s, readout, h_fc1, sess, merged, writer):

    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    # readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices = 1)
    readout_action = tf.reduce_sum(tf.multiply(readout, a),
                                   reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open(out_put_path + "/readout.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state(store_network_path)

    #saver.restore(sess, "new_networks/pong-dqn-"+str(pretrain_number))

    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        #saver.restore(sess, "my_networks/pong-dqn-26000")
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    print("Press any key and Enter to continue:")
    # raw_input()

    epsilon = INITIAL_EPSILON
    t = 0
    total_score = 0
    positive_score = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]

        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            total_score = total_score + r_t

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        if r_t == 1:
            positive_score = positive_score + r_t

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess,
                       store_network_path + GAME + '-dqn',
                       global_step=t + pretrain_number)

            #saver.save(sess, 'new_networks/' + GAME + '-dqn', global_step = t)

        if t % 500 == 0:
            now = datetime.datetime.now()
            diff_seconds = (now - start).seconds
            time_text = sencond2time(diff_seconds)

            result = sess.run(merged, feed_dict={s: [s_t]})
            writer.add_summary(result, t + pretrain_number)
            a_file.write(str(t+pretrain_number)+','+",".join([str(x) for x in readout_t]) + \
            ','+str(total_score)+ ','+str(positive_score) \
            +','+time_text+'\n')

        # print info

        print ("TIMESTEP:", t+pretrain_number, "/ ACTION:", action_index, "/ REWARD:", r_t, "/ Q_MAX: %e" % np.max(readout_t),'  time:(H,M,S):' \
        + sencond2time((datetime.datetime.now()-start).seconds))
        print('Total score:', total_score, ' Positive_score:', positive_score,
              '   up:', readout_t[0], '    down:', readout_t[1], '  no:',
              readout_t[2])
Example #9
def trainNetwork(model, args):
    # open up a game state to communicate with emulator

    game_state = game.GameState()
    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal, _ = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=0)

    # In Keras, need to reshape
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
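    # the leading dimension is the batch axis, so the state has shape (1, 4, 80, 80), channels-first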

    learning_mode = 0  # 2 for learning based on a human, 3 for reverse reinforcement
    if args['mode'] == 'Run':
        OBSERVE = 999999999  # We keep observe, never train
        epsilon = FINAL_EPSILON
        print("Now we load weight")
        model.load_weights("model1.h5")
        adam = Adam(lr=1e-6)
        model.compile(loss='mse', optimizer=adam)
        print("Weight load successfully")
        training_mode = False  # running
        os.mkdir("pic", 0755)
        a_file = open("logs_" + GAME + "/logfile_" + str(counter) + ".txt",
                      'a')
    else:  # We go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON
        learning_mode = int(args['learning_mode'])

        if os.path.isfile("model.h5"):  # check if file exists.
            model.load_weights("model.h5")
            adam = Adam(lr=1e-6)
            model.compile(loss='mse', optimizer=adam)
            print("Weight load successfully")

        # printing log file
        training_mode = True  # training

    j = 1
    os.mkdir("pic/" + str(j), 0755)
    t = 0
    pic_counter = 0

    while (True):
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])

        # choose an action epsilon greedy
        if t % FRAME_PER_ACTION == 0:
            if not training_mode:  # running
                q = model.predict(
                    s_t)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index = max_Q
                a_t[action_index] = 1

        # We reduced the epsilon gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observed next state and reward
        x_t1_colored, r_t, terminal, score = game_state.frame_step(a_t)

        game_over = terminal

        x_t1 = skimage.color.rgb2gray(x_t1_colored)

        if (score >= 0):
            fig1 = plt.figure(pic_counter)
            plt.imshow(x_t1_colored)
            print('time now: ', datetime.datetime.now())
        fig1.savefig('pic/' + str(j) + '/' + str(pic_counter) +
                     'colored pic.png')

        plt.close()

        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        s_t1 = np.append(x_t1, s_t[:, :3, :, :], axis=1)

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        s_t = s_t1
        t = t + 1
        pic_counter += 1

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        if (game_over):
            j = j + 1
            os.mkdir("pic/" + str(j), 0755)
            print(j, "score: ", score, file=a_file)
            a_file.flush()
            pic_counter = 0

    print("Episode finished!")
    print("************************")
Example #10
def trainNetwork(model1, model2, args):
    player1_wins_in_a_row = 0
    player2_wins_in_a_row = 0

    player1_num_of_trains = 0
    player2_num_of_trains = 0

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D1 = deque()
    D2 = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t1, r_0, terminal, _, _ = game_state.frame_step(do_nothing, do_nothing)

    x_t1 = skimage.color.rgb2gray(x_t1)
    x_t1 = skimage.transform.resize(x_t1, (80, 80))
    x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

    x_t2 = np.flipud(x_t1)

    player1_curr_state = np.stack((x_t1, x_t1, x_t1, x_t1), axis=0)
    player1_curr_state = player1_curr_state.reshape(
        1, player1_curr_state.shape[0], player1_curr_state.shape[1],
        player1_curr_state.shape[2])

    player2_curr_state = np.stack((x_t2, x_t2, x_t2, x_t2), axis=0)
    player2_curr_state = player2_curr_state.reshape(
        1, player2_curr_state.shape[0], player2_curr_state.shape[1],
        player2_curr_state.shape[2])

    #training mode
    OBSERVE = OBSERVATION
    epsilon = INITIAL_EPSILON

    #moving old trials to old_trials folder
    if os.path.exists("trials_simultaneously"):
        copytree("trials_simultaneously", "old_trials_simultaneously")
        shutil.rmtree("trials_simultaneously")

    os.mkdir("trials_simultaneously", 0755)
    learning_mode = int(args['learning_mode'])  #which player learns

    if os.path.isfile("model1.h5"):
        model1.load_weights("model1.h5")

    if os.path.isfile("model2.h5"):
        model2.load_weights("model2.h5")

    adam = Adam(lr=1e-6)
    model1.compile(loss='mse', optimizer=adam)
    model2.compile(loss='mse', optimizer=adam)

    print("Weights loaded successfully")

    training_mode = True  # training

    observation_counter = 0
    num_folder = 0
    start_time = datetime.datetime.now()


    while (True):
        loss1 = 0
        loss2 = 0
        Q_sa1 = 0
        action_index1 = 0
        r_t1 = 0
        Q_sa2 = 0
        action_index2 = 0
        r_t2 = 0

        a_t1 = np.zeros([ACTIONS])
        a_t2 = np.zeros([ACTIONS])

        #choose an action epsilon greedy
        if (observation_counter % FRAME_PER_ACTION) == 0:

            if random.random() <= epsilon:  # for player1
                action_index1 = random.randrange(ACTIONS)
                a_t1[action_index1] = 1

            else:
                q = model1.predict(
                    player1_curr_state
                )  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index1 = max_Q
                a_t1[action_index1] = 1

            if random.random() <= epsilon:  # for player2
                action_index2 = random.randrange(ACTIONS)
                a_t2[action_index2] = 1

            else:
                q = model2.predict(
                    player2_curr_state
                )  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index2 = max_Q
                a_t2[action_index2] = 1

        #We reduced the epsilon gradually
        if (epsilon > FINAL_EPSILON) and (observation_counter > OBSERVE):
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observed next state and reward
        x_t1_colored, r_t1, terminal, score, _ = game_state.frame_step(
            a_t1, a_t2)
        r_t2 = r_t1 * (-1)
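        # zero-sum game: player 2's reward is the negation of player 1's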
        game_over = terminal

        x_t1_grey = skimage.color.rgb2gray(x_t1_colored)
        thresh = threshold_otsu(x_t1_grey)
        x_t1 = x_t1_grey > thresh  # binary image
        x_t2 = np.flipud(x_t1)

        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

        x_t2 = skimage.transform.resize(x_t2, (80, 80))
        x_t2 = skimage.exposure.rescale_intensity(x_t2, out_range=(0, 255))

        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        x_t2 = x_t2.reshape(1, 1, x_t2.shape[0], x_t2.shape[1])

        player1_next_state = np.append(x_t1,
                                       player1_curr_state[:, :3, :, :],
                                       axis=1)
        player2_next_state = np.append(x_t2,
                                       player2_curr_state[:, :3, :, :],
                                       axis=1)

        # store the transition in D
        D1.append((player1_curr_state, action_index1, r_t1, player1_next_state,
                   terminal))
        if len(D1) > REPLAY_MEMORY:
            D1.popleft()

        D2.append((player2_curr_state, action_index2, r_t2, player2_next_state,
                   terminal))
        if len(D2) > REPLAY_MEMORY:
            D2.popleft()

        if observation_counter > OBSERVE:
            # sample a minibatch to train on
            minibatch1 = random.sample(D1, BATCH)
            minibatch2 = random.sample(D2, BATCH)

            inputs1 = np.zeros((BATCH, IMAGE_DEPTH, IMAGE_WIDTH,
                                IMAGE_HEIGHT))  # 32, 4, 80, 80
            targets1 = np.zeros((BATCH, ACTIONS))  # 32, 2

            inputs2 = np.zeros((BATCH, IMAGE_DEPTH, IMAGE_WIDTH,
                                IMAGE_HEIGHT))  # 32, 4, 80, 80
            targets2 = np.zeros((BATCH, ACTIONS))  # 32, 2

            # Now we do the experience replay
            for i in range(0, len(minibatch1)):
                curr_state_t1 = minibatch1[i][0]
                action_t1 = minibatch1[i][1]  # This is action index
                reward_t1 = minibatch1[i][2]
                next_state_t1 = minibatch1[i][3]
                terminal1 = minibatch1[i][4]
                # if terminated, only equals reward

                inputs1[i:i + 1] = curr_state_t1  # I saved down s_t1

                targets1[i] = model1.predict(
                    curr_state_t1)  # predicted Q-value for each action (button)
                Q_sa1 = model1.predict(curr_state_t1)

                if terminal1:
                    targets1[i, action_t1] = reward_t1
                else:
                    targets1[i, action_t1] = reward_t1 + GAMMA * np.max(Q_sa1)

            loss1 += model1.train_on_batch(inputs1, targets1)

            # Now we do the experience replay
            for i in range(0, len(minibatch2)):
                curr_state_t2 = minibatch2[i][0]
                action_t2 = minibatch2[i][1]  # This is action index
                reward_t2 = minibatch2[i][2]
                next_state_t2 = minibatch2[i][3]
                terminal2 = minibatch2[i][4]
                # if terminated, only equals reward

                inputs2[i:i + 1] = curr_state_t2

                targets2[i] = model2.predict(
                    curr_state_t2)  # predicted Q-value for each action (button)
                Q_sa2 = model2.predict(curr_state_t2)

                if terminal2:
                    targets2[i, action_t2] = reward_t2
                else:
                    targets2[i, action_t2] = reward_t2 + GAMMA * np.max(Q_sa2)

            loss2 += model2.train_on_batch(inputs2, targets2)

        player1_curr_state = player1_next_state
        player2_curr_state = player2_next_state
        observation_counter = observation_counter + 1

        # save progress every 10000 iterations
        if observation_counter % 100 == 0:
            #print("Now we save model")

            if learning_mode == 1:
                model1.save_weights("model1.h5", overwrite=True)
                with open("model1.json", "w") as outfile1:
                    json.dump(model1.to_json(), outfile1)

            elif learning_mode == 2:
                model2.save_weights("model2.h5", overwrite=True)
                with open("model2.json", "w") as outfile2:
                    json.dump(model2.to_json(), outfile2)

        current_time = datetime.datetime.now()
        elapsedTime = (current_time - start_time).total_seconds()

        if (elapsedTime >= 30 * 60):
            num_folder += 1
            start_time = datetime.datetime.now()

            os.makedirs(
                "trials_simultaneously/" + "player" + str(1) + "learning" +
                "/" + str(num_folder), 0755)

            shutil.copy2(
                'model1.h5', "trials_simultaneously/" + "player" + str(1) +
                "learning" + "/" + str(num_folder) + '/model1.h5')

            os.makedirs(
                "trials_simultaneously/" + "player" + str(2) + "learning" +
                "/" + str(num_folder), 0755)

            shutil.copy2(
                'model2.h5', "trials_simultaneously/" + "player" + str(2) +
                "learning" + "/" + str(num_folder) + '/model2.h5')

        if (game_over):
            if score[0] < score[1]:
                player2_wins_in_a_row = player2_wins_in_a_row + 1
                player1_wins_in_a_row = 0
                percentage = 0.0
            elif score[1] < score[0]:
                player1_wins_in_a_row = player1_wins_in_a_row + 1
                player2_wins_in_a_row = 0
                percentage = 1.0
            else:
                percentage = (score[0] / float((score[0] + score[1])))

    print("Episode finished!")
    print("************************")
def trainNetwork(s, coeff, readout, sess):
    tick = time.time()
    # define the cost function
    a = tf.placeholder("float", [None,actions])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices = 1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()
    # store the previous observations in replay memory
    replay_memory = 100000
    D = deque()
    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(actions)
    do_nothing[0] = 1
    x_t, r_0, terminal, bar1_score, bar2_score = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2)  
    # saving and loading networks
    saver = tf.train.Saver()
    #sess.run(tf.initialize_all_variables())
    sess.run(tf.global_variables_initializer())
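    # zero-filled tensors fed through the `coeff` placeholder: b_IJ1 for single-state
    # action selection (batch size 1), b_IJ2 for training minibatches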
    b_IJ1 = np.zeros((1, 1152, 10, 1, 1)).astype(np.float32) # batch_size=1
    b_IJ2 = np.zeros((batch, 1152, 10, 1, 1)).astype(np.float32) # batch_size=BATCH
    FINAL_EPSILON = 0.05 
    INITIAL_EPSILON = 1.0 
    epsilon = INITIAL_EPSILON
    t = 0
    episode = 0
    OBSERVE = 1000
    EXPLORE = 5000
    while True:
        
        readout_t = readout.eval(feed_dict = {s:s_t.reshape((1,84,84,4)), coeff:b_IJ1})
        
        a_t = np.zeros([actions])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(actions)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        K = 1 
        for i in range(0, K):
            x_t1_col, r_t, terminal, bar1_score, bar2_score = game_state.frame_step(a_t)
            if(terminal == 1):
                episode +=1
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (84, 84)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1,1,255,cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (84, 84, 1))
            s_t1 = np.append(x_t1, s_t[:,:,0:3], axis = 2)

            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()
        
        if t > OBSERVE and t%train_freq==0:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s:s_j1_batch, coeff:b_IJ2 })
            for i in range(0, len(minibatch)):
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + gamma * np.max(readout_j1_batch[i]))

            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch,
                coeff: b_IJ2})

        s_t = s_t1
        t += 1

        if r_t != 0:
            print("Timestep", t, "/ Score", bar1_score)

        if (bar1_score - bar2_score > 17):
            print("Game_Ends_in Time:", int(time.time() - tick))
            break
        if (bar1_score - bar2_score > 15):
            print("Game_Mid_in Time:", int(time.time() - tick))
Example #12
def play_game(left_player):

    # open up a game state to communicate with emulator
    game_state = game.GameState()
    time.sleep(2)
    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)

    # Prevents error in frame step. Both bars stay in place.
    do_nothing[0] = 1
    game_image_data, _, r_0, terminal, _, _ = game_state.frame_step(
        do_nothing, do_nothing)

    # image processing
    game_image_data = skimage.color.rgb2gray(game_image_data)
    game_image_data = skimage.transform.resize(game_image_data, (80, 80))
    game_image_data = skimage.exposure.rescale_intensity(game_image_data,
                                                         out_range=(0, 255))

    for i in range(80):  # erasing the line in the right side of the screen
        game_image_data[79, i] = 0

    # initiating first 4 frames to the same frame
    last_4_frames = np.stack(
        (game_image_data, game_image_data, game_image_data, game_image_data),
        axis=0)

    # In Keras, need to reshape
    last_4_frames = last_4_frames.reshape(1, last_4_frames.shape[0],
                                          last_4_frames.shape[1],
                                          last_4_frames.shape[2])

    number_of_games = 3

    while number_of_games > 0:
        actions_vector1 = np.zeros([ACTIONS])
        actions_vector2 = np.zeros([ACTIONS])

        # choose an action for the left player:
        q1 = left_player.model.predict(
            last_4_frames)  # input a stack of 4 images, get the prediction
        max_Q1 = np.argmax(q1)
        action_index1 = max_Q1
        actions_vector1[action_index1] = 1

        # action for right player: input from a human - keyboard
        # events = pygame.event.get()
        # for event in events:
        #     if event.type == pygame.KEYDOWN:
        #         if event.key == pygame.K_UP:
        #             #action_index2 = 1
        #             actions_vector2[1] = 1
        #         if event.key == pygame.K_DOWN:
        #             #action_index2 = 2
        #             actions_vector2[2] = 1
        #         if event.key == pygame.K_ESCAPE:
        #             exit()

        keys = pygame.key.get_pressed()  # checking pressed keys
        if keys[pygame.K_UP]:
            actions_vector2[1] = 1
        if keys[pygame.K_DOWN]:
            actions_vector2[2] = 1

        #actions_vector2[action_index2] = 1

        # in order for us to see the game
        image_data_colored1, _, _, terminal, score, no_learning_time =\
            game_state.frame_step(actions_vector1, actions_vector2)

        game_over = terminal
        if game_over:
            # count finished games so the three-game loop actually terminates
            number_of_games -= 1
            time.sleep(5)

        # image processing
        x_t1 = skimage.color.rgb2gray(image_data_colored1)
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

        for i in range(80):  # erasing the line in the right side of the screen
            x_t1[79, i] = 0

        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        last_4_frames1 = np.append(x_t1, last_4_frames[:, :3, :, :], axis=1)

        last_4_frames = last_4_frames1

    print('game over!\n')
Example #13
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    ## Loss Function
    readout_action = tf.reduce_sum(tf.multiply(readout, a),
                                   reduction_indices=1)  # use as Q(s, a)?
    cost = tf.reduce_mean(tf.square(y - readout_action))
    ## If PSO or GA were used instead, it would go here to minimize the cost and update the weights.
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    ##    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    ##    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()  #create a saver
    sess.run(tf.initialize_all_variables())

    ##checkpoint = tf.train.get_checkpoint_state("saved_networks")

    path_c = "C:/Users/marco/Desktop/Documentos/DeepReinforcedLearning/DeepLearningVideoGames-master/saved_networks"

    # Correction for Loading TF session
    checkpoint = tf.train.latest_checkpoint(path_c,
                                            latest_filename="checkpoint")

    if checkpoint:
        saver.restore(sess, checkpoint)
        print("Successfully loaded:")
        print(checkpoint)
    else:
        print("Could not find old network weights")

    epsilon = INITIAL_EPSILON
    t = 0
    while t < 100000:  #"pigs" != "fly":
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)),
                                cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal only equals reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] +
                                   GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn2', global_step=t)

            # print info
            state = ""
            if t <= OBSERVE:
                state = "observe"
            elif t > OBSERVE and t <= OBSERVE + EXPLORE:
                state = "explore"
            else:
                state = "train"
            print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
                  "/ ACTION", action_index, "/ REWARD", r_t,
                  "/ Q_MAX %e" % np.max(readout_t))
Example #14
def actorLearner(num, sess, lock):
    # We use global shared O parameter vector
    # We use global shared Otarget parameter vector
    # We use global shared counter T, and TMAX constant
    global TMAX, T

    # Open up a game state to communicate with emulator
    lock.acquire()
    game_state = game.GameState()
    lock.release()

    # Initialize network gradients
    s_j_batch = []
    a_batch = []
    y_batch = []

    # Get the first state by doing nothing and preprocess the image to 80x80x4
    lock.acquire()
    x_t, r_0, terminal = game_state.frame_step([1, 0, 0])
    lock.release()
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    aux_s = s_t

    time.sleep(3 * num)
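    # stagger the start of each learner thread (3 seconds per thread index)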

    # Initialize target network weights
    copyTargetNetwork(sess)

    epsilon_index = random.randrange(EPSILONS)
    INITIAL_EPSILON = INITIAL_EPSILONS[epsilon_index]
    FINAL_EPSILON = FINAL_EPSILONS[epsilon_index]
    epsilon = INITIAL_EPSILON
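    # each learner thread samples its own exploration schedule from the predefined epsilon lists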

    print "THREAD ", num, "STARTING...", "EXPLORATION POLICY => INITIAL_EPSILON:", INITIAL_EPSILON, ", FINAL_EPSILON:", FINAL_EPSILON

    # Initialize thread step counter
    t = 0
    score = 0
    while T < TMAX and score < WISHED_SCORE:

        # Choose an action epsilon greedily
        readout_t = O_readout.eval(session=sess, feed_dict={s: [s_t]})
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # Scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Run the selected action and observe next state and reward
        lock.acquire()
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        lock.release()
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        aux_s = np.delete(s_t, 0, axis=2)
        s_t1 = np.append(aux_s, x_t1, axis=2)
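        # s_t1 keeps the 3 most recent frames of s_t (oldest channel dropped)
        # plus the newly observed frame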

        # Accumulate gradients
        readout_j1 = Ot_readout.eval(session=sess, feed_dict={st: [s_t1]})
        if terminal:
            y_batch.append(r_t)
        else:
            y_batch.append(r_t + GAMMA * np.max(readout_j1))

        a_batch.append(a_t)
        s_j_batch.append(s_t)

        # Update the old values
        s_t = s_t1
        T += 1
        t += 1
        score += r_t

        # Update the Otarget network
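        # (the target network is refreshed from the online network every It global
        #  steps, so the bootstrap targets stay fixed between refreshes)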
        if T % It == 0:
            copyTargetNetwork(sess)

        # Update the O network
        if t % Iasync == 0 or terminal:
            if s_j_batch:
                # Perform asynchronous update of O network
                train_O.run(session=sess,
                            feed_dict={
                                y: y_batch,
                                a: a_batch,
                                s: s_j_batch
                            })

            # Clear gradients
            s_j_batch = []
            a_batch = []
            y_batch = []

        # Save progress every 5000 iterations
        if t % 5000 == 0:
            saver.save(sess,
                       'save_networks_asyn/' + GAME + '-dqn',
                       global_step=t)

        # Print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        if terminal:
            print "THREAD:", num, "/ TIME", T, "/ TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(
                readout_t), "/ SCORE", score
            score = 0
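
The snippet above calls copyTargetNetwork(sess) without showing its definition. A minimal sketch, assuming the online and target network variables are collected in two lists (O_params and Ot_params are hypothetical names, not taken from the snippet), of how such a copy op is commonly built in TensorFlow 1.x:

import tensorflow as tf

def make_copy_op(online_vars, target_vars):
    # one assign per variable, grouped so a single sess.run() syncs the target network
    return tf.group(*[tf.assign(target, online)
                      for online, target in zip(online_vars, target_vars)])

# Hypothetical wiring (O_params / Ot_params are not defined in the snippet above):
# copy_op = make_copy_op(O_params, Ot_params)
# def copyTargetNetwork(sess):
#     sess.run(copy_op)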
Exemple #15
0
def train_sequentially(left_player, right_player, first_learning_player):
    if os.path.exists('logs'):
        if os.path.exists('old_logs'):
            shutil.rmtree('old_logs')
        os.mkdir('old_logs', 0755)
        copytree('logs', 'old_logs')
        shutil.rmtree('logs')
    os.mkdir('logs', 0755)

    # moving old trials to old_trials folder
    if os.path.exists('trials_sequentially'):
        if os.path.exists('old_trials_sequentially'):
            shutil.rmtree('old_trials_sequentially')
        os.mkdir('old_trials_sequentially', 0755)
        copytree('trials_sequentially', 'old_trials_sequentially')
        shutil.rmtree('trials_sequentially')
    os.mkdir('trials_sequentially', 0755)

    current_log_folder = ''
    left_player.num_of_trains = 1
    if (first_learning_player == CurrentPlayer.Left):
        current_training_player = CurrentPlayer.Left
        left_player.num_of_trains += 1
        current_log_folder = 'logs/' + 'left_player' + \
                             '_learning' + str(left_player.num_of_trains)

    elif (first_learning_player == CurrentPlayer.Right):
        current_training_player = CurrentPlayer.Right
        right_player.num_of_trains += 1
        current_log_folder = 'logs/' + 'right_player' + \
                             '_learning' + str(right_player.num_of_trains)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D1 = deque()
    D2 = deque()

    # get the first state (do nothing) and pre-process the image to 4x80x80
    do_nothing = np.zeros(NUM_OF_ACTIONS)
    do_nothing[0] = 1

    single_game_frame, _, _, terminal, _, _ = game_state.frame_step(
        do_nothing, do_nothing)

    single_game_frame = skimage.color.rgb2gray(single_game_frame)
    single_game_frame = skimage.transform.resize(single_game_frame,
                                                 (IMAGE_WIDTH, IMAGE_HEIGHT))
    single_game_frame = skimage.exposure.rescale_intensity(single_game_frame,
                                                           out_range=(0, 255))

    for i in range(80):  # erase the line on the right side of the screen
        single_game_frame[79, i] = 0

    # stack 4 copies of the first frame to form the initial state (a state = 4 frames)
    current_state = np.stack((single_game_frame, single_game_frame,
                              single_game_frame, single_game_frame),
                             axis=0)

    current_state = current_state.reshape(1, current_state.shape[0],
                                          current_state.shape[1],
                                          current_state.shape[2])
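    # current_state now has shape (1, 4, 80, 80): a batch dimension followed by
    # the 4 stacked frames (channels-first ordering)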

    epsilon = INITIAL_EPSILON
    observation_counter = 0
    num_folder = 0
    start_time = datetime.datetime.now()
    start_time_loss = datetime.datetime.now()
    losses = []
    episode_number = 0
    exploration_counter = 0
    exploration_flag = 0

    #j = 1
    # os.mkdir("pic/", 0755);
    # t = 0
    # pic_counter = 0

    while True:
        loss = 0
        Q_sa = 0

        # player1 - left player
        action_index1 = 0
        reward1 = 0
        action_left_player = np.zeros([NUM_OF_ACTIONS])

        # player2 - right player
        action_index2 = 0
        reward2 = 0
        action_right_player = np.zeros([NUM_OF_ACTIONS])

        # choose an action epsilon greedy
        if (observation_counter % FRAME_PER_ACTION) == 0:
            if current_training_player == CurrentPlayer.Left:
                if random.random() <= epsilon:
                    action_index1 = random.randrange(NUM_OF_ACTIONS)
                    action_left_player[action_index1] = 1
                else:
                    q = left_player.model.predict(
                        current_state
                    )  # input a stack of 4 images, get the prediction
                    max_Q = np.argmax(q)
                    action_index1 = max_Q
                    action_left_player[action_index1] = 1

                # right player:
                q = right_player.model.predict(
                    current_state
                )  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index2 = max_Q
                action_right_player[action_index2] = 1

            elif current_training_player == CurrentPlayer.Right:
                if random.random() <= epsilon or exploration_flag == 1:
                    action_index2 = random.randrange(NUM_OF_ACTIONS)
                    action_right_player[action_index2] = 1
                else:
                    q = right_player.model.predict(
                        current_state
                    )  # input a stack of 4 images, get the prediction
                    max_Q = np.argmax(q)
                    action_index2 = max_Q
                    action_right_player[action_index2] = 1

                # left player:
                q = left_player.model.predict(
                    current_state
                )  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index1 = max_Q
                action_left_player[action_index1] = 1

        # gradually anneal epsilon once the observation phase is over
        if (epsilon > FINAL_EPSILON) and (observation_counter > OBSERVATION):
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        single_game_frame_colored, reward1, reward2, terminal, score, _ = \
            game_state.frame_step(action_left_player, action_right_player)

        single_game_frame_gray = skimage.color.rgb2gray(
            single_game_frame_colored)
        thresh = threshold_otsu(single_game_frame_gray)
        single_game_frame = single_game_frame_gray > thresh
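        # Otsu's threshold binarises the frame before resizing (the very first
        # frame above was only converted to grayscale, not thresholded)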

        single_game_frame = skimage.transform.resize(single_game_frame,
                                                     (80, 80))
        single_game_frame = skimage.exposure.rescale_intensity(
            single_game_frame, out_range=(0, 255))

        for i in range(80):  # erase the line on the right side of the screen
            single_game_frame[79, i] = 0

        # if (score >= 0):
        #     fig1 = plt.figure(pic_counter)
        #     # plt.imshow(x_t1_colored)
        #     plt.imshow(single_game_frame)
        #     print('time now: ', datetime.datetime.now())
        # #fig1.savefig('pic/' + str(j) + '/' + str(pic_counter) + 'colored pic.png')
        # fig1.savefig('pic/' + str(pic_counter) + 'colored pic.png')
        #
        # plt.close()
        #
        # t = t + 1
        # pic_counter += 1

        single_game_frame = single_game_frame.reshape(
            1, 1, single_game_frame.shape[0], single_game_frame.shape[1])

        # next 4 images = next state
        next_state = np.append(single_game_frame,
                               current_state[:, :3, :, :],
                               axis=1)
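        # the newest frame is prepended and the oldest (channel 3) is dropped,
        # keeping a sliding window of 4 frames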

        if current_training_player == CurrentPlayer.Left:
            D1.append(
                (current_state, action_index1, reward1, next_state, terminal))
            if len(D1) > REPLAY_MEMORY:
                D1.popleft()

        elif current_training_player == CurrentPlayer.Right:
            D2.append(
                (current_state, action_index2, reward2, next_state, terminal))
            if len(D2) > REPLAY_MEMORY:
                D2.popleft()

        # only train if done observing
        if observation_counter > OBSERVATION:
            # sample a minibatch to train on - breaks the correlation between consecutive states
            minibatch = None
            if current_training_player == CurrentPlayer.Left:
                minibatch = random.sample(D1, BATCH)
            elif current_training_player == CurrentPlayer.Right:
                minibatch = random.sample(D2, BATCH)

            inputs = np.zeros(
                (BATCH, current_state.shape[1], current_state.shape[2],
                 current_state.shape[3]))  # 32, 4, 80, 80
            targets = np.zeros((inputs.shape[0], NUM_OF_ACTIONS))  # 32, 2
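            # targets are filled below with the model's current predictions, so only
            # the entry of the action actually taken is overwritten with the Bellman
            # target; the other actions contribute no training error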

            # experience replay
            for i in range(0, len(minibatch)):
                current_state_t = minibatch[i][0]
                action_t = minibatch[i][1]  # This is action index
                reward_t = minibatch[i][2]
                next_state_t = minibatch[i][3]
                terminal_t = minibatch[i][4]

                inputs[i:i + 1] = current_state_t

                if (current_training_player == CurrentPlayer.Left):
                    targets[i] = left_player.model.predict(current_state_t)
                    Q_sa = left_player.model.predict(next_state_t)
                elif (current_training_player == CurrentPlayer.Right):
                    targets[i] = right_player.model.predict(current_state_t)
                    Q_sa = right_player.model.predict(next_state_t)

                if terminal_t:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            if (current_training_player == CurrentPlayer.Left):
                loss += left_player.model.train_on_batch(inputs, targets)
            elif (current_training_player == CurrentPlayer.Right):
                loss += right_player.model.train_on_batch(inputs, targets)

                # log_file_loss_path = current_log_folder + '/loss'
                # log_file_qmax_path = current_log_folder + '/qmax'
                # num_of_lines_in_loss_file = 0
                # loss_file = None
                #
                # if not os.path.exists(current_log_folder):
                #     os.mkdir(current_log_folder, 0755)
                #
                # elapsed_time_loss = (current_time - start_time_loss).total_seconds()
                #
                # if elapsed_time_loss >= 1 * 60:
                #     episode_number += 1
                #     start_time_loss = datetime.datetime.now()
                #     loss_file = open(log_file_loss_path, 'a')
                #     num_of_lines_in_loss_file = num_of_lines_in_file(log_file_loss_path)
                #
                #     #with open(log_file_loss_path, 'a') as loss_file:
                #     loss_file.write(str(episode_number) + ' : ' + str(loss) + '\n')
                #     loss_file.flush()
                #     loss_file.close()

        current_state = next_state
        observation_counter = observation_counter + 1

        # save progress (updating weights files) every 100 iterations
        if observation_counter % 100 == 0:
            if current_training_player == CurrentPlayer.Left:
                left_player.model.save_weights('model1.h5', overwrite=True)
                with open('model1.json', 'w') as outfile:
                    json.dump(left_player.model.to_json(), outfile)

            elif current_training_player == CurrentPlayer.Right:
                right_player.model.save_weights('model2.h5', overwrite=True)
                with open('model2.json', 'w') as outfile:
                    json.dump(right_player.model.to_json(), outfile)

        current_time = datetime.datetime.now()
        elapsed_time = (current_time - start_time).total_seconds()

        if elapsed_time >= 5 * 60:
            start_time = datetime.datetime.now()
            num_folder = save_weights_file(num_folder, current_training_player,
                                           left_player, right_player)

        if terminal:  # game over
            if score[0] < score[1]:
                right_player.num_of_wins_in_a_row += 1
                left_player.num_of_wins_in_a_row = 0
            elif score[1] < score[0]:
                left_player.num_of_wins_in_a_row += 1
                right_player.num_of_wins_in_a_row = 0

            print('game ended:\n')
            print('right player wins in a row: ',
                  right_player.num_of_wins_in_a_row, '\n')
            print('left player wins in a row: ',
                  left_player.num_of_wins_in_a_row, '\n')

            if (current_training_player == CurrentPlayer.Left
                    and left_player.num_of_wins_in_a_row == 30):
                _ = save_weights_file(num_folder, current_training_player,
                                      left_player, right_player)
                # plot_loss(current_log_folder, current_log_folder + '/loss')
                # subprocess.call('. ~/flappy/bin/activate && ' + TEST_SEQUENTIAL_COMMAND + ' ' + str(1) +
                #                 ' ' + str(left_player.num_of_trains) + ' ' + str(right_player.num_of_trains),
                #                 shell=True)

                left_player.num_of_wins_in_a_row = 0
                right_player.num_of_wins_in_a_row = 0
                observation_counter = 0
                epsilon = INITIAL_EPSILON
                num_folder = 0
                current_training_player = CurrentPlayer.Right
                right_player.num_of_trains += 1
                current_log_folder = 'logs/' + 'right_player' + \
                                     '_learning' + str(right_player.num_of_trains)
                episode_number = 0
                D1.clear()
                start_time = datetime.datetime.now()
                break

            elif (current_training_player == CurrentPlayer.Right
                  and right_player.num_of_wins_in_a_row == 30):
                # plot_loss(current_log_folder, current_log_folder + '/loss')
                _ = save_weights_file(num_folder, current_training_player,
                                      left_player, right_player)
                # subprocess.call('. ~/flappy/bin/activate && ' + TEST_SEQUENTIAL_COMMAND + ' ' + str(2) +
                #                 ' ' + str(left_player.num_of_trains) + ' ' + str(right_player.num_of_trains),
                #                 shell=True)

                left_player.num_of_wins_in_a_row = 0
                right_player.num_of_wins_in_a_row = 0
                observation_counter = 0
                epsilon = INITIAL_EPSILON
                num_folder = 0
                current_training_player = CurrentPlayer.Left
                left_player.num_of_trains += 1
                current_log_folder = 'logs/' + 'left_player' + \
                                     '_learning' + str(left_player.num_of_trains)
                episode_number = 0
                D2.clear()
                start_time = datetime.datetime.now()
                break

    print("Episode finished!")
    print("************************")
Exemple #16
0
def run_test(learning_player, left_player, right_player, num_of_test,
             test_player_log_file):
    if not os.path.exists(test_player_log_file):
        os.makedirs(test_player_log_file, 0755)

    test_start_time = datetime.datetime.now()
    no_learning_time = 0
    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)

    # Prevents error in frame step. Both bars stay in place.
    do_nothing[0] = 1
    game_image_data, _, r_0, terminal, _, _ = game_state.frame_step(
        do_nothing, do_nothing)

    # image processing
    game_image_data = skimage.color.rgb2gray(game_image_data)
    game_image_data = skimage.transform.resize(game_image_data, (80, 80))
    game_image_data = skimage.exposure.rescale_intensity(game_image_data,
                                                         out_range=(0, 255))

    for i in range(80):  # erase the line on the right side of the screen
        game_image_data[79, i] = 0

    # initialize the first 4 frames to the same frame
    last_4_frames = np.stack(
        (game_image_data, game_image_data, game_image_data, game_image_data),
        axis=0)

    # add a batch dimension for the Keras model: shape (1, 4, 80, 80)
    last_4_frames = last_4_frames.reshape(1, last_4_frames.shape[0],
                                          last_4_frames.shape[1],
                                          last_4_frames.shape[2])

    num_folder = 0
    left_player_scores = []
    right_player_scores = []
    time_list = []
    game_score = []
    left_player_q_max_list = []
    right_player_q_max_list = []

    number_of_games = 10
    #number_of_games = 10 #for epsilons tests

    original_number_of_games = number_of_games

    game_start_time = datetime.datetime.now()

    left_player.num_of_wins = 0
    right_player.num_of_wins = 0

    while (number_of_games > 0):
        actions_vector1 = np.zeros([ACTIONS])
        actions_vector2 = np.zeros([ACTIONS])

        # choose the greedy action for player 1 (no exploration during testing):
        q1 = left_player.model.predict(
            last_4_frames)  # input a stack of 4 images, get the prediction
        max_Q1 = np.argmax(q1)
        action_index1 = max_Q1
        q_max1 = np.amax(q1)
        left_player_q_max_list.append((num_folder, q_max1))

        # actions_vector1: one-hot input vector for frame_step holding player 1's chosen action,
        # e.g. [0, 1, 0] = up
        actions_vector1[action_index1] = 1

        # choose the greedy action for player 2 (no exploration during testing):
        q2 = right_player.model.predict(
            last_4_frames)  # input a stack of 4 images, get the prediction
        max_Q2 = np.argmax(q2)
        action_index2 = max_Q2
        q_max2 = np.amax(q2)
        right_player_q_max_list.append((num_folder, q_max2))

        # actions_vector2: one-hot input vector for frame_step holding player 2's chosen action,
        # e.g. [0, 1, 0] = up
        actions_vector2[action_index2] = 1
        with open(test_player_log_file + '/qmax', 'a') as qmax_log_file:
            if learning_player == 1:
                qmax_log_file.write(
                    str(num_of_test) + ' : ' + str(q_max1) + '\n')
            else:
                qmax_log_file.write(
                    str(num_of_test) + ' : ' + str(q_max2) + '\n')

        # step the game with both players' actions and observe the next frame (this also renders the game)
        image_data_colored1, _, _, terminal, score, no_learning_time = game_state.frame_step(
            actions_vector1, actions_vector2)

        game_over = terminal
        if game_over:
            number_of_games -= 1
            print(
                str(datetime.datetime.now()) + " game ended:   " +
                str(number_of_games) + " games left for the test")

            if (score[0] > score[1]):
                left_player.num_of_wins += 1
            else:
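                # note: a tied score is counted as a right-player win here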
                right_player.num_of_wins += 1

            with open(test_player_log_file + "/" + "game_over_log",
                      "a") as game_over_file:
                game_over_file.write("score: " + str(score) +
                                     "   game duration: " +
                                     str((datetime.datetime.now() -
                                          game_start_time).total_seconds() -
                                         no_learning_time) + " [sec]" + "\n")
                game_over_file.flush()

            left_player_scores.append(score[0])
            right_player_scores.append(score[1])
            game_score.append(score)

            current_time = datetime.datetime.now()
            elapsedTime = (current_time -
                           game_start_time).total_seconds() - no_learning_time
            time_list.append(elapsedTime)

            game_start_time = datetime.datetime.now()

        # image processing
        x_t1 = skimage.color.rgb2gray(image_data_colored1)
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

        for i in range(80):  # erase the line on the right side of the screen
            x_t1[79, i] = 0

        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        last_4_frames1 = np.append(x_t1, last_4_frames[:, :3, :, :], axis=1)
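        # as in training: prepend the newest frame and drop the oldest of the previous 4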

        last_4_frames = last_4_frames1

    if (number_of_games == 0):
        average_time = np.mean(time_list)
        left_player_average_score = np.mean(left_player_scores)
        right_player_average_score = np.mean(right_player_scores)

        print("left_player_num_of_wins: ", left_player.num_of_wins)
        print("\nright_player_num_of_wins: ", right_player.num_of_wins)

        left_player_win_percentage = (left_player.num_of_wins /
                                      float(original_number_of_games)) * 100
        right_player_win_percentage = (right_player.num_of_wins /
                                       float(original_number_of_games)) * 100

        print("\n\nleft_player_win_percentage: ", left_player_win_percentage)
        print("\nright_player_win_percentage: ", right_player_win_percentage)

        with open(test_player_log_file + "/" + "game_summary",
                  "a") as results_file:
            results_file.write("\n" + "Game Summary" + str(num_of_test) + ":" +
                               "\n" + "   left player average score: " +
                               str(left_player_average_score) + " [points]" +
                               "\n" + "   left player win percentage: " +
                               str(left_player_win_percentage) + "%" + "\n" +
                               "   right player average score: " +
                               str(right_player_average_score) + " [points]" +
                               "\n" + "   right player win percentage: " +
                               str(right_player_win_percentage) + "%" + "\n" +
                               "   average time: " + str(average_time) +
                               "[sec]" + "\n" + "\n" + "\n")

    return time_list, game_score, left_player_q_max_list, right_player_q_max_list