Example #1
async def play_hand_ai(callback):
    game = pong.PongGame(training_mode=False)
    while True:
        game.play_default_move(pong_constants.PLAYER1)
        game.play_default_move(pong_constants.PLAYER2)
        game.evolve()
        await callback(game)
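A minimal driver for the coroutine above could look like the following sketch. It assumes Python's asyncio and a hypothetical print_stats callback; because play_hand_ai loops forever, the demo is capped with a timeout.

import asyncio

async def print_stats(game):
    # hypothetical callback: print the running stats for each frame
    print(game.stats)

def demo_hand_ai():
    # play_hand_ai never returns on its own, so stop the demo after 5 seconds
    try:
        asyncio.run(asyncio.wait_for(play_hand_ai(print_stats), timeout=5))
    except asyncio.TimeoutError:
        pass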
Example #2
async def async_evaluate_performance(session,
                                     graph,
                                     training_mode,
                                     callback=None):

    game = pong.PongGame(training_mode=training_mode)

    def total_points(game):
        return pong_stats.total_points(game.stats)

    while total_points(game) < config.NUM_POINTS_PER_EVALUATION:
        prev_stats = game.stats
        prev_game_state = example.model_state(game.state)
        if config.CHOOSE_BEST_ALWAYS:
            chosen_action_idx = choose_best_action(session, graph,
                                                   prev_game_state)
        else:
            chosen_action_idx = choose_action(session, graph, prev_game_state)
        play_action_idx(game, chosen_action_idx)
        next_stats = game.stats

        if callback:
            await callback(game)

        points_did_change = (pong_stats.total_points(prev_stats) !=
                             pong_stats.total_points(next_stats))
        should_log = (points_did_change
                      and (total_points(game) % config.POINTS_PER_LOG == 0))
        if should_log:
            print(f"eval point #{total_points(game)}")
            print(game.stats)
Example #3
def show_game(p1):  # p1 is an agent
    pygame.init()
    DISP = pygame.display.set_mode((800, 800))
    pygame.display.set_caption('PONG!')

    game = pong.PongGame()

    run = True

    res = 0
    while run:
        draw(game, DISP)

        # move_num_left = np.argmax(np.dot(agnts[0].params, game.getState(0)))-1
        move_num_left = p1.get_move(game.getState(-1))

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                run = False

        res = game.transition(move_num_left)

        run = (res == 0)
        pygame.display.update()

    pygame.quit()
Example #4
def get_winner(p1, p2):  # p1 and p2 are both agents
    run = True
    game = pong.PongGame()

    res = 0

    cnt = 0

    while run:
        cnt += 1
        # move_num_left = np.argmax(np.dot(agnts[0].params, game.getState(0)))-1
        move_num_left = p1.get_move(game.getState(-1))
        move_num_right = p2.get_move(game.getState(1))

        # for event in pygame.event.get():
        # 	if event.type == pygame.QUIT:
        # 		run = False

        res = game.transition(move_num_left, move_num_right)

        run = (res == 0)

        if (cnt > 1000):
            break

    if cnt > 35:
        print('We exceeded 35 with', cnt)

    return res
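As a hedged usage sketch only: a round-robin wrapper over get_winner could tally the outcome codes per pairing, as below. The snippet does not show what game.transition (and hence get_winner) returns, so the tally simply records the raw codes, and the agents list is hypothetical.

from collections import Counter

def tournament(agents, games_per_pair=5):
    # hypothetical helper: play every ordered pair several times and count
    # how often each outcome code occurs for that pairing
    results = Counter()
    for i, p1 in enumerate(agents):
        for j, p2 in enumerate(agents):
            if i == j:
                continue
            for _ in range(games_per_pair):
                results[(i, j, get_winner(p1, p2))] += 1
    return results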
Example #5
def trainGraph(inp, out, sess):
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  #ground truth
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(action - gt))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
    game = pong.PongGame()
    D = deque()
    trash, frame = game.nextFrame([0, 0, 0])
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)
    saver = tf.train.Saver()
    # initialize variables first, then restore the checkpoint so the restored
    # weights are not overwritten by the initializer
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, './save/pong-dqn-80000')
    t = 0
    epsilon = INITIAL_EPSILON
    while (1):
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])
        if (random.random() <= epsilon):
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        reward_t, frame = game.nextFrame(argmax_t)
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)
        D.append((inp_t, argmax_t, reward_t, inp_t1))
        if len(D) > REPLAY_MEMORY:
            D.popleft()
        if t > OBSERVE:
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]
            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })
        inp_t = inp_t1
        t = t + 1
        if t % 10000 == 0:
            saver.save(sess, './save/' + 'pong' + '-dqn', global_step=t)
        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
Example #6
def evaluate_action(game_state, action_idx):
    # Shouldn't matter what mode because we replace the state.
    game = pong.PongGame(training_mode=False)
    # Unpack the game state.
    game.state.paddle1_pos = game_state[0]
    game.state.paddle2_pos = game_state[1]
    game.state.ball_pos = np.array([game_state[2], game_state[3]])
    game.state.ball_vel = np.array([game_state[4], game_state[5]])
    prev_state = game.state
    prev_stats = game.stats
    play_action_idx(game, action_idx)
    next_state = game.state
    next_stats = game.stats

    return example.reward(prev_state, prev_stats, next_state, next_stats)
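Since evaluate_action scores a single (state, action) pair, a greedy policy can be sketched on top of it as below. The default action count of 3 is an assumption for illustration; the real value would come from the project's config/constants, which this snippet does not show.

import numpy as np

def greedy_action(game_state, num_actions=3):
    # hypothetical helper: pick the action with the highest one-step reward
    rewards = [evaluate_action(game_state, idx) for idx in range(num_actions)]
    return int(np.argmax(rewards))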
Example #7
def train_epoch(run_info, epoch_idx):
    game = pong.PongGame(training_mode=config.TRAINING_MODE)

    losses = []
    batch_idxs = range(1, config.NUM_BATCHES_PER_EPOCH + 1)
    for batch_idx in batch_idxs:
        bi = batch_info(epoch_idx, batch_idx)

        example.generate_data(run_info, bi, game)
        batch_loss = train_batch(run_info, bi)

        if batch_loss:
            losses.append(batch_loss)
        if batch_idx % config.BATCHES_PER_LOG == 0:
            log_batch(bi, losses, game)
            losses = []
Example #8
def fitness(p1):  # p1 is an agent
    res = 0

    cnt = 0

    run = True
    game = pong.PongGame()
    while run:
        cnt += 1
        # move_num_left = np.argmax(np.dot(agnts[0].params, game.getState(0)))-1
        move_num_left = p1.get_move(game.getState(-1))

        res = game.transition(move_num_left)

        run = (res == 0)

        if (cnt > 10000):
            # print('reached 10000')
            break

    return cnt
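Because fitness returns the number of frames the agent keeps the rally going, ranking a population is straightforward. The sketch below is only an illustration of how the function might be used; the agents list is hypothetical.

def rank_population(agents):
    # hypothetical helper: longest-surviving agents first
    return sorted(agents, key=fitness, reverse=True)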
Example #9
def trainNetwork(inp, out, user_playing, use_model, filename):
    # initialize global variables
    argmax = tf.placeholder("float", [None, ACTIONS])

    # define ground truth
    gt = tf.placeholder("float", [None])

    # globally define a variable to store how many iterations have been done
    global_time = tf.Variable(0, name='global_time')

    # define optimization and cost functions (simple quadratic cost function)
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(action - gt))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # initialize game and replay queue
    game = pong.PongGame(user_playing)
    replay = deque()

    # given the frame data, we want to convert it to greyscale and resize it to 60 x 60
    frame = game.getCurrentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)

    # create input tensor using numpy to stack four frames
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # implement saver
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=True))
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

    # If filename specified, try and load that
    if filename is not None:
        checkpoint = './saves/' + filename
    else:
        # try and restore latest checkpoint
        checkpoint = tf.train.latest_checkpoint('./saves')

    if checkpoint != None:
        # was able to retrieve save
        saver.restore(sess, checkpoint)
    else:
        # no checkpoint found, initialize new neural network
        init = tf.global_variables_initializer()
        sess.run(init)

    t = global_time.eval()
    numIterations = 0
    epsilon = INITIAL_EPSILON

    while True:
        starttime = time.time()
        # feed the input tensor into the neural network (only one index)
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]

        # create an empty tensor of size ACTIONS
        argmax_t = np.zeros([ACTIONS])

        # if < epsilon then random action, otherwise predicted action
        if random.random() <= epsilon and not use_model:
            index = choice((0, 1, 2), p=(0.9, 0.05, 0.05))
        else:
            index = np.argmax(out_t)

        # update epsilon
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # set the corresponding position to 1 corresponding to the desired action
        argmax_t[index] = 1

        if use_model:
            mode = 'Model Only'
        elif numIterations > OBSERVE:
            mode = 'Exploring'
        else:
            mode = 'Observing'

        # retrieve next frame
        reward_t, frame = game.getNextFrame(
            argmax_t, [t, np.max(out_t), epsilon, mode])
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))

        # create a new input tensor by adding the new frame and keeping previous three frames
        newinp_t = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # store experience in queue for later
        replay.append((inp_t, argmax_t, reward_t, newinp_t))

        # if we run out of memory, pop the oldest one and add new experience
        if len(replay) > REPLAY_MEMORY:
            replay.popleft()

        if numIterations > OBSERVE and not use_model:
            # have stored enough experiences, time to train using stored memories!!!!
            training_batch = random.sample(replay, BATCH_SIZE)

            # break it up into separate lists and create a batch for ground truths
            inp_batch, argmax_batch, reward_batch, newinp_batch, gt_batch = [], [], [], [], []

            for current in training_batch:
                inp_batch.append(current[0])
                argmax_batch.append(current[1])
                reward_batch.append(current[2])
                newinp_batch.append(current[3])

            # create a batch for the outputs (will be used to train)
            out_batch = out.eval(feed_dict={inp: newinp_batch})

            # the ground truth is found using the formula gt = reward + gamma * Qmax (of next frame)
            for i in range(len(training_batch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # training time, feed the batches to the train_step we defined earlier (will go where placeholders were)
            train_step.run(feed_dict={
                argmax: argmax_batch,
                gt: gt_batch,
                inp: inp_batch
            })

        # update the input tensor and increment t and numIterations
        inp_t = newinp_t
        t += 1
        numIterations += 1

        # if it is time to save the state, do so
        if t % SAVE_STEP == 0 and not use_model:
            sess.run(global_time.assign(t))
            saver.save(sess, './saves/model.ckpt', global_step=t)

        endtime = time.time()
        difference = endtime - starttime
        if use_model and difference < (1 / FPS):
            time.sleep((1 / FPS) - difference)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", index,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
Example #10
def trainGraph(inp, out, sess):
    game = pong.PongGame()

    # to calculate the argmax, we multiply the predicted output with a vector with one value 1 and rest as 0
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth

    # action
    prob = tf.nn.softmax(out)
    action = tf.reduce_sum(tf.multiply(prob, argmax), reduction_indices=1)
    # cost function we will reduce through backpropagation
    cost = tf.reduce_mean(tf.multiply(action, gt))  #tf.square(action - gt))
    # optimization function to minimize our cost function
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # create a queue for experience replay to store policies
    Point = deque()
    D = deque()

    # initial frame
    frame = game.getPresentFrame()
    # convert rgb to gray scale for processing
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    # binary colors, black or white
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # stack frames, that is our input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # saver
    saver = tf.train.Saver()

    sess.run(tf.initialize_all_variables())

    t = 0
    epsilon = INITIAL_EPSILON
    # track wins across the whole run (not reset inside the loop)
    win_count = 0

    # training time
    while (1):
        # output tensor
        prob_eval, out_eval = sess.run([prob, out], feed_dict={inp: [inp_t]})
        prob_t = prob_eval[0]
        out_t = out_eval[0]
        # argmax function
        argmax_t = np.zeros([ACTIONS])

        #
        #     maxIndex = np.argmax(out_t[0])
        #       if (random.random() > prob_t[maxIndex]):
        #            maxIndex = random.randrange(ACTIONS)

        #    argmax_t[maxIndex] = 1

        if (random.random() <= epsilon):
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # reward tensor if score is positive
        reward_t, frame = game.getNextFrame(argmax_t)
        # get frame pixel data
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        # new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)
        # add our input tensor, argmax tensor, reward and updated input tensor to stack of experiences
        if (reward_t == 0):

            Point.append([inp_t, argmax_t, reward_t, inp_t1])
            if (len(Point) > 300):
                Point.popleft()
        else:
            reward_array = np.zeros(shape=[len(Point) + 1])
            #filling values in the reward array
            for i in reversed(range(0, len(Point) + 1)):
                if (i == len(Point)):
                    Point.append([inp_t, argmax_t, reward_t, inp_t1])

                    reward_array[i] = reward_t

                else:

                    reward_array[i] = reward_array[i + 1] * GAMMA + reward_array[i]
            #adding reward array values back into frames and putting them into frame queue
            reward_array -= np.mean(reward_array)
            reward_array /= np.std(reward_array)

            print("how long did the point last?     ", len(Point))
            for i in range(len(reward_array)):
                QueueTransfer = Point.popleft()
                QueueTransfer[2] = reward_array[i]
                D.append(QueueTransfer)
            if reward_t == 1:
                win_count += 1

        # if we run out of replay memory, make room
        if (len(D) > REPLAY_MEMORY):
            for i in range(0, len(D) - REPLAY_MEMORY):
                D.popleft()

        # training iteration
        if (len(D) == REPLAY_MEMORY):
            print("training")

            # get values from our replay memory
            np.random.shuffle(D)
            inp_batch = []
            argmax_batch = []
            reward_batch = []
            inp_t1_batch = []
            for i in range(len(D)):
                temp = D.pop()
                inp_batch.append(temp[0])
                argmax_batch.append(temp[1])
                reward_batch.append(temp[2])
                inp_t1_batch.append(temp[3])

            c1, a1, o1 = sess.run(
                [cost, action, out],
                feed_dict={
                    gt: [reward_batch[-1]],
                    argmax: [argmax_batch[-1]],
                    inp: [inp_batch[-1]]
                })

            # train on that
            train_step.run(feed_dict={
                gt: reward_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1

        # print where we are after saving
        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ ACTION", maxIndex, "/ EPSILON", epsilon,
              "/ REWARD", reward_t, "/ Wins ", win_count,
              "/ Q_MAX %e" % np.max(out_t))
Example #11
def main():
    #initialize the game
    game = pong.PongGame()
    
    # get initial frame
    frame = game.getPresentFrame()
    # convert rgb to gray scale for processing
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    # binary colors, black or white
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # stack frames, that is our input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis = 2)

    # create and initialize graphs
    g1 = tf.Graph()
    g2 = tf.Graph()

    with g1.as_default():
        inp_A, out_A = createGraph()
        saver_A = tf.train.Saver(tf.global_variables())
        sess_A = tf.Session(graph=g1)
        checkpoint_A = tf.train.latest_checkpoint('./checkpoints_A')
        saver_A.restore(sess_A, checkpoint_A)
        
    with g2.as_default():
        inp_B, out_B = createGraph()
        saver_B = tf.train.Saver(tf.global_variables())
        sess_B = tf.Session(graph=g2)
        checkpoint_B = tf.train.latest_checkpoint('./checkpoints_B')
        saver_B.restore(sess_B, checkpoint_B)
    
    # keep track of timesteps
    t = 0

    while(1):
        # output tensor
        out_t_A = out_A.eval(session=sess_A, feed_dict = {inp_A : [inp_t]})[0]
        out_t_B = out_B.eval(session=sess_B, feed_dict = {inp_B : [inp_t]})[0]
        # argmax function
        argmax_t_A = np.zeros([ACTIONS])
        argmax_t_B = np.zeros([ACTIONS])

        maxIndex_A = np.argmax(out_t_A)
        maxIndex_B = np.argmax(out_t_B)
        argmax_t_A[maxIndex_A] = 1
        argmax_t_B[maxIndex_B] = 1
    
        # reward tensor
        score1, score2, cumScore1, cumScore2, rewardID_player1, rewardID_player2, cumID1, cumID2, \
        rewardSE_player1, rewardSE_player2, cumSE1, cumSE2, frame = game.getNextFrame(argmax_t_A, argmax_t_B)
        
        # get frame pixel data
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))
        # new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis = 2)
             
        # update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1   

        # save stats log
        if score1 == 1 or score2 == 1:
            with open('stats_test.txt', 'a') as log:
                scoreline = 'TIMESTEP ' + str(t) + ' cumScore1 ' + str(cumScore1) + ' cumScore2 ' + str(cumScore2) \
                + ' ID1 ' + str(rewardID_player1) + ' ID2 ' + str(rewardID_player2) + ' cumID1 ' + str(cumID1) \
                + ' cumID2 ' + str(cumID2) + ' SE1 ' + str(rewardSE_player1) + ' SE2 ' + str(rewardSE_player2) \
                + ' cumSE1 ' + str(cumSE1) + ' cumSE2 ' + str(cumSE2) + '\n'
                log.write(scoreline)
            
        print("TIMESTEP", t, "/ EPSILON", "0")
Example #12
def trainGraph(inp, out):

    #to calculate the argmax, we multiply the predicted output with a vector with one value 1 and rest as 0
    argmax = tf.placeholder("float", [None, ACTIONS]) 
    gt = tf.placeholder("float", [None]) #ground truth
    global_step = tf.Variable(0, name='global_step')


    #action
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices = 1)
    #cost function we will reduce through backpropagation
    cost = tf.reduce_mean(tf.square(action - gt))
    #optimization function to minimize our cost function
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    #initialize our game
    game = pong.PongGame()
    
    #create a queue for experience replay to store policies
    D = deque()

    #initial frame
    frame = game.getPresentFrame()
    #convert rgb to gray scale for processing
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    #binary colors, black or white
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    #stack frames, that is our input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis = 2)

    #saver
    saver = tf.train.Saver(tf.global_variables())    
    # use a SessionManager to help with automatic variable restoration    
    sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))
    
    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    if checkpoint != None:
        print('Restore Checkpoint %s'%(checkpoint))      
        saver.restore(sess, checkpoint)
        print("Model restored.")   
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Initialized new Graph")

    t = global_step.eval()
    c = 0
    
    epsilon = INITIAL_EPSILON
    
    #training time
    while(1):
        #output tensor
        out_t = out.eval(feed_dict = {inp : [inp_t]})[0]
        #argmax function
        argmax_t = np.zeros([ACTIONS])

        #
        if(random.random() <= epsilon and not USE_MODEL):
            # make 0 the most chosen action for realistic randomness
            maxIndex = choice((0, 1, 2), p=(0.90, 0.05, 0.05))
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1
        
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        mode = 'observing'
        if t > OBSERVE:
            mode = 'training'
        if USE_MODEL:
            mode = 'model only'

        #reward tensor if score is positive
        reward_t, frame = game.getNextFrame(argmax_t, [t, np.max(out_t), epsilon, mode])
        #get frame pixel data
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))
        #new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis = 2)
        
        #add our input tensor, argmax tensor, reward and updated input tensor to stack of experiences
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        #if we run out of replay memory, make room
        if len(D) > REPLAY_MEMORY:
            D.popleft()
        
        #training iteration
        if c > OBSERVE and not USE_MODEL:

            #get values from our replay memory
            minibatch = random.sample(D, BATCH)
        
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]
        
            gt_batch = []
            out_batch = out.eval(feed_dict = {inp : inp_t1_batch})
            
            #add values to our batch
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))



            #train on that 
            train_step.run(feed_dict = {
                           gt : gt_batch,
                           argmax : argmax_batch,
                           inp : inp_batch
                           })
        
        #update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1   
        c = c + 1     

        #print where we are after saving
        if t % SAVE_STEP == 0 and not USE_MODEL:
            sess.run(global_step.assign(t))            
            saver.save(sess, './checkpoints/model.ckpt', global_step=t)    

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex, "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
Example #13
def trainAgent(net):

    # initialize our game
    game = pong.PongGame()

    # create a queue for experience replay to store policies
    # and set the maxlength equals to the size of replay memory
    D = deque(maxlen=REPLAY_MEMORY)

    # initial frame
    frame = game.getPresentFrame()
    # convert rgb to gray scale for processing
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    # binary colors, black or white
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # stack frames, that is our input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    Network = net().cuda()

    optimizer = optim.Adam(Network.parameters(), lr=1e-5)
    criterion = nn.MSELoss()

    writer = SummaryWriter(log_dir='./logs')

    if os.path.exists('./params.pkl'):
        print('Restore from exists model')
        Network.load_state_dict(torch.load('./params.pkl'))
        # TODO: record steps
        steps = 0
    else:
        steps = 0

    expected_epsilon = INITIAL_EPSILON - steps * (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
    if expected_epsilon > FINAL_EPSILON:
        epsilon = expected_epsilon
    else:
        epsilon = FINAL_EPSILON
    total_observe = steps + ADDITIONAL_OB

    # training time
    while(1):
        out_t = Network(to_tensor(inp_t))
        # argmax function
        argmax_t = np.zeros([ACTIONS])

        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            _, maxIndex = torch.max(out_t, 1)
            maxIndex = maxIndex.cpu().numpy()[0]
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # reward tensor if score is positive
        reward_t, frame, hit_rate, hit_rate_100 = game.getNextFrame(argmax_t)
        # get frame pixel data
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        # new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # add our input tensor, argmax tensor, reward and updated input tensor to stack of experiences
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # training iteration
        if steps > total_observe:

            # get values from our replay memory
            minibatch = random.sample(D, BATCH)
            # minibatch = np.array(minibatch).transpose()

            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            out_prev_batch = Network(to_tensor(inp_batch))
            out_batch = Network(to_tensor(inp_t1_batch))

            # ground truth = reward + gamma * max Q of the next state,
            # computed directly as a tensor below

            # action = np.mean(np.multiply(argmax_batch, out_prev_batch.data.cpu().numpy()), axis=1)
            action = torch.sum(out_prev_batch.mul(torch.FloatTensor(argmax_batch).cuda()), dim=1)
            gt_batch = torch.FloatTensor(reward_batch).cuda() + GAMMA * out_batch.max(1)[0]
            gt_batch = torch.autograd.Variable(gt_batch, requires_grad=False)

            loss = criterion(action, gt_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # update our input tensor to the next frame
        inp_t = inp_t1
        steps += 1

        # record the agent's performance every 100 steps
        if steps % 100 == 0:
            writer.add_scalars('', {'hit_rate': hit_rate,
                                    'hit_rate_100': hit_rate_100}, steps)

        # print where we are after saving
        if steps % 10000 == 0:
            torch.save(Network.state_dict(), './params.pkl')

        print("TIMESTEP", steps, "/ EPSILON %7.5f" % epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % torch.max(out_t))

        # stop training after 1M steps
        if steps > 1000000:
            break
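Several of the DQN examples above repeat the same frame preprocessing inline: resize, grayscale conversion, binary threshold, then stacking the newest frame onto the previous three. The sketch below merely factors out that repeated pattern, using the 84x84 size from this example; it is not part of any of the original projects.

import cv2
import numpy as np

def preprocess(frame, size=84):
    # resize, convert to grayscale, and binarize, as in the loops above
    frame = cv2.cvtColor(cv2.resize(frame, (size, size)), cv2.COLOR_BGR2GRAY)
    _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    return frame

def push_frame(stack, frame, size=84):
    # keep the newest frame plus the previous three along the channel axis
    frame = np.reshape(preprocess(frame, size), (size, size, 1))
    return np.append(frame, stack[:, :, 0:3], axis=2)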
Example #14
def trainGraph(inp, out):
    # preparation stage - game, frames, saver and checkpoints management
    # to calculate the argmax, we multiply the predicted output with a vector with one value 1 and rest as 0
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth
    global_step = tf.Variable(0, name='global_step')

    # action
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    # cost function which we will reduce through backpropagation
    # what's the action ???
    cost = tf.reduce_mean(tf.square(action - gt))
    # optimization function to minimize our cost function
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # initialize the game
    game = pong.PongGame(0)

    # create a queue for experience replay to store policies
    DL = deque()
    #DR = deque()
    # get initial frame
    frame = game.getPresentFrame()

    # stack frames, create the input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # saver and checkpoints management
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    #sess = tf.InteractiveSession(config=tf.ConfigProto(log_device_placement=True))

    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    if checkpoint != None:
        print('Restore Checkpoint %s' % (checkpoint))
        saver.restore(sess, checkpoint)
        print("Model restored.")
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Initialized new Graph")

    t = global_step.eval()
    c = 0

    epsilon = INITIAL_EPSILON

    train_side = 0
    # training DQN and exporting stats

    while (1):
        # output tensor
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        # argmax function
        argmax_t = np.zeros([ACTIONS])

        # pick action
        if (random.random() <= epsilon and not USE_MODEL):
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        if train_side == 1:
            action_tuple = (None, argmax_t)
        else:
            action_tuple = (argmax_t, None)

        # My first goal is to run one agent that can beat the opponent both sides

        frame, rewards, done, cumScores = game.getNextFrame(action_tuple)

        reward_t = rewards[train_side]

        # flip the image data so the view matches the left side
        if train_side == 1:
            frame = frame[::-1, :]

        #cv2.imwrite("imgs/%i_train_side.png" % t, frame)

        frame = np.reshape(frame, (60, 60, 1))
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)
        #if train_side == 1:
        #        DR.append((inp_t, argmax_t, reward_t, inp_t1))
        #        if len(DR) > REPLAY_MEMORY:
        #            DR.popleft()
        #else:
        DL.append((inp_t, argmax_t, reward_t, inp_t1))
        if len(DL) > REPLAY_MEMORY:
            DL.popleft()

        if c > OBSERVE and not USE_MODEL:
            #if train_side == 1:
            #    minibatch = random.sample(DR, BATCH)
            #else:
            minibatch = random.sample(DL, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            # add values to our batch
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # train on that
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1
        c = c + 1

        # save checkpoints
        if t % SAVE_STEP == 0 and not USE_MODEL:
            sess.run(global_step.assign(t))
            saver.save(sess, './checkpoints/' + 'model.ckpt', global_step=t)

        # save stats log
        if done:
            with open('stats_test.txt', 'a') as log:
                scoreline = 'TIMESTEP ' + str(t) + ' cumScore1 ' + str(
                    cumScores[train_side]) + ' cumScore2 ' + str(
                        cumScores[1 - train_side]) + ' train side ' + str(
                            train_side) + '\n'
                print(scoreline.strip())
                log.write(scoreline)
            train_side = 1 - train_side
Example #15
def trainGraph(inp, out, sess):

    #to calculate the argmax, we multiply the predicted output with a vector with one value 1 and rest as 0
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  #ground truth

    #action
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    #cost function we will reduce through backpropagation
    cost = tf.reduce_mean(tf.square(action - gt))
    #optimization function to minimize our cost function
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    #initialize our game
    game = pong.PongGame()

    #create a queue for experience replay to store policies
    Point = deque()
    D = deque()

    #initial frame
    frame = game.getPresentFrame()
    #convert rgb to gray scale for processing
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    #binary colors, black or white
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    #stack frames, that is our input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    #saver
    saver = tf.train.Saver()

    sess.run(tf.initialize_all_variables())

    t = 0
    epsilon = INITIAL_EPSILON

    #training time
    while (1):
        #output tensor
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        #argmax function
        argmax_t = np.zeros([ACTIONS])

        #
        if (random.random() <= epsilon):
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        #reward tensor if score is positive
        reward_t, frame = game.getNextFrame(argmax_t)
        #get frame pixel data
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        #new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        #add our input tensor, argmax tensor, reward and updated input tensor to stack of experiences
        if (reward_t == 0):
            Point.append((inp_t, argmax_t, reward_t, inp_t1))
        else:
            framearray = np.empty(shape=(len(Point) + 1), dtype=object)
            for i in range(len(Point) + 1):
                if (i == 0):
                    framearray[i] = (inp_t, argmax_t, reward_t, inp_t1)
                else:
                    temp = Point.pop()
                    framearray[i] = temp
            D.append(framearray)

        #if we run out of replay memory, make room
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        #training iteration
        if len(D) > 0:
            if (BATCH > len(D)):
                currentBATCH = len(D)

            else:
                currentBATCH = BATCH

            #get values from our replay memory
            minibatch = random.sample(D, currentBATCH)
            inp_batch = []
            argmax_batch = []
            reward_batch = []
            inp_t1_batch = []
            for i in range(len(minibatch)):
                for j in range(len(minibatch[i])):
                    inp_batch.append(minibatch[i][j][0])
                    argmax_batch.append(minibatch[i][j][1])
                    reward_batch.append(minibatch[i][j][2])
                    inp_t1_batch.append(minibatch[i][j][3])

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            #add values to our batch (iterate over all flattened transitions)
            for i in range(0, len(inp_batch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            #train on that
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        #update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1

        #print where we are after saving
        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
Example #16
def trainGraph(inp, out, sess):

    # to calculate the argmax, we multiply the predicted output with a vector
    # with one value 1 and rest as 0
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth

    # action
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    # cost function we will reduce through backpropagation
    cost = tf.reduce_mean(tf.square(action - gt))
    # optimization function to minimize our cost function
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # initialize our game
    game = pong.PongGame()

    # create a queue for experience replay to store policies
    D = deque()

    # initial frame
    frame = game.getPresentFrame()

    # stack frames, that is our input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # saver
    saver = tf.train.Saver()

    sess.run(tf.global_variables_initializer())

    t = 0
    epsilon = INITIAL_EPSILON

    # training time
    while (1):
        # output tensor
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        # argmax function
        argmax_t = np.zeros([ACTIONS])

        # random action with prob epsilon
        if (random.random() <= epsilon):
            maxIndex = random.randrange(ACTIONS)
        # predicted action with prob (1 - epsilon)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # reward tensor if score is positive
        reward_t, frame = game.getNextFrame(argmax_t)

        frame = np.reshape(frame, (INPUT_SIZE, INPUT_SIZE, 1))

        # new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # add our input tensor, argmax tensor, reward and updated input tensor
        # to stack of experiences
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # if we run out of replay memory, make room
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # training iteration
        if t > OBSERVE:

            # get values from our replay memory
            minibatch = random.sample(D, BATCH)

            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            # add values to our batch
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # train on that
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1

        # print where we are after saving
        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
Example #17
def trainGraph(inp, out):

    # preparation stage - game, frames, saver and checkpoints management

    global_step = tf.Variable(0, name='global_step')
    # initialize Pong
    game = pong.PongGame()
    # get initial frame
    frame = game.getPresentFrame()
    # convert rgb to gray scale for processing
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    # binary colors, black or white
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # stack frames, create the input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # saver and checkpoints management
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=True))

    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    if checkpoint != None:
        print('Restore Checkpoint %s' % (checkpoint))
        saver.restore(sess, checkpoint)
        print("Model restored.")
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Initialized new Graph")

    t = global_step.eval()

    # running DQN and exporting the stats
    while (1):
        # output tensor
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]

        argmax_t = np.zeros([ACTIONS])
        maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        # reward tensor
        score1, score2, cumScore1, cumScore2, rewardID_player1, rewardID_player2, cumID1, cumID2, \
        rewardSE_player1, rewardSE_player2, cumSE1, cumSE2, frame = game.getNextFrame(argmax_t)

        # reward of the agent that we are testing
        if REWARD == 'rewardID_player1':
            reward_t = rewardID_player1
        if REWARD == 'rewardID_player2':
            reward_t = rewardID_player2
        if REWARD == 'rewardSE_player1':
            reward_t = rewardSE_player1
        if REWARD == 'rewardSE_player2':
            reward_t = rewardSE_player2

        # get frame pixel data
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))
        # new input tensor
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # update our input tensor to the next frame
        inp_t = inp_t1
        t = t + 1

        # save stats log
        if score1 == 1 or score2 == 1:
            with open('stats_test.txt', 'a') as log:
                scoreline = 'TIMESTEP ' + str(t) + ' cumScore1 ' + str(cumScore1) + ' cumScore2 ' + str(cumScore2) \
                + ' ID1 ' + str(rewardID_player1) + ' ID2 ' + str(rewardID_player2) + ' cumID1 ' + str(cumID1) \
                + ' cumID2 ' + str(cumID2) + ' SE1 ' + str(rewardSE_player1) + ' SE2 ' + str(rewardSE_player2) \
                + ' cumSE1 ' + str(cumSE1) + ' cumSE2 ' + str(cumSE2) + '\n'
                log.write(scoreline)

        print("TIMESTEP", t, "/ EPSILON", "0", "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))