async def play_hand_ai(callback):
    game = pong.PongGame(training_mode=False)
    while True:
        game.play_default_move(pong_constants.PLAYER1)
        game.play_default_move(pong_constants.PLAYER2)
        game.evolve()
        await callback(game)
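# A minimal driver sketch (an assumption, not from the original source) for
# how play_hand_ai might be run: asyncio executes the coroutine while the
# callback paces and displays each frame. render() is hypothetical, and the
# loop above never returns, so this runs until interrupted.
import asyncio

async def render(game):
    print(game.stats)            # stand-in for real rendering
    await asyncio.sleep(1 / 60)  # pace the game at roughly 60 FPS

# asyncio.run(play_hand_ai(render))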
async def async_evaluate_performance(session, graph, training_mode, callback=None):
    game = pong.PongGame(training_mode=training_mode)

    def total_points(game):
        return pong_stats.total_points(game.stats)

    while total_points(game) < config.NUM_POINTS_PER_EVALUATION:
        prev_stats = game.stats
        prev_game_state = example.model_state(game.state)
        if config.CHOOSE_BEST_ALWAYS:
            chosen_action_idx = choose_best_action(session, graph, prev_game_state)
        else:
            chosen_action_idx = choose_action(session, graph, prev_game_state)
        play_action_idx(game, chosen_action_idx)
        next_stats = game.stats

        if callback:
            await callback(game)

        points_did_change = (
            pong_stats.total_points(prev_stats)
            != pong_stats.total_points(next_stats))
        should_log = (
            points_did_change
            and (total_points(game) % config.POINTS_PER_LOG == 0))
        if should_log:
            print(f"eval point #{total_points(game)}")
            print(game.stats)
def show_game(p1):
    # p1 is an agent controlling the left paddle.
    pygame.init()
    DISP = pygame.display.set_mode((800, 800))
    pygame.display.set_caption('PONG!')
    game = pong.PongGame()
    run = True
    while run:
        draw(game, DISP)
        move_num_left = p1.get_move(game.getState(-1))
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                run = False
        res = game.transition(move_num_left)
        # Keep any QUIT signal from the event loop instead of overwriting it.
        run = run and (res == 0)
        pygame.display.update()
    pygame.quit()
def get_winner(p1, p2):
    # p1 and p2 are both agents.
    run = True
    game = pong.PongGame()
    res = 0
    cnt = 0
    while run:
        cnt += 1
        move_num_left = p1.get_move(game.getState(-1))
        move_num_right = p2.get_move(game.getState(1))
        res = game.transition(move_num_left, move_num_right)
        run = (res == 0)
        if cnt > 1000:
            break
    if cnt > 35:
        print('We exceeded 35 with', cnt)
    return res
def trainGraph(inp, out, sess):
    # One-hot action selector and ground-truth Q-value placeholders.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth

    # Q-value of the chosen action, squared-error cost, Adam optimizer.
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(action - gt))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    game = pong.PongGame()
    D = deque()

    # Initial frame: resize to 84x84, grayscale, binarize, stack four copies.
    trash, frame = game.nextFrame([0, 0, 0])
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    saver = tf.train.Saver()
    # Initialize first, then restore: running the initializer after a
    # restore would overwrite the checkpointed weights.
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, './save/pong-dqn-80000')

    t = 0
    epsilon = INITIAL_EPSILON
    while True:
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])

        # Epsilon-greedy action selection.
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        reward_t, frame = game.nextFrame(argmax_t)
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        D.append((inp_t, argmax_t, reward_t, inp_t1))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        if t > OBSERVE:
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            # Bellman targets: gt = reward + gamma * max Q(next state).
            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        inp_t = inp_t1
        t = t + 1

        if t % 10000 == 0:
            saver.save(sess, './save/' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
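# The resize -> grayscale -> binarize preprocessing above is repeated
# verbatim in several of the training loops below. A small helper like this
# sketch (hypothetical; not part of the original code) would factor it out.
# It assumes BGR input frames, as the cv2 calls above do.
def preprocess_frame(frame, size=84):
    """Resize, grayscale, and binarize one raw frame."""
    frame = cv2.cvtColor(cv2.resize(frame, (size, size)), cv2.COLOR_BGR2GRAY)
    _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    return frame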
def evaluate_action(game_state, action_idx):
    # Mode shouldn't matter here because we replace the state outright.
    game = pong.PongGame(training_mode=False)

    # Unpack the game state.
    game.state.paddle1_pos = game_state[0]
    game.state.paddle2_pos = game_state[1]
    game.state.ball_pos = np.array([game_state[2], game_state[3]])
    game.state.ball_vel = np.array([game_state[4], game_state[5]])

    prev_state = game.state
    prev_stats = game.stats
    play_action_idx(game, action_idx)
    next_state = game.state
    next_stats = game.stats

    return example.reward(prev_state, prev_stats, next_state, next_stats)
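# A minimal usage sketch (an assumption, not from the original source):
# score each action's one-step reward with evaluate_action and take the
# best. The default of three actions (up / stay / down) is hypothetical.
def best_one_step_action(game_state, num_actions=3):
    rewards = [evaluate_action(game_state, idx) for idx in range(num_actions)]
    return int(np.argmax(rewards))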
def train_epoch(run_info, epoch_idx):
    game = pong.PongGame(training_mode=config.TRAINING_MODE)
    losses = []
    batch_idxs = range(1, config.NUM_BATCHES_PER_EPOCH + 1)
    for batch_idx in batch_idxs:
        bi = batch_info(epoch_idx, batch_idx)
        example.generate_data(run_info, bi, game)
        batch_loss = train_batch(run_info, bi)
        if batch_loss:
            losses.append(batch_loss)
        if batch_idx % config.BATCHES_PER_LOG == 0:
            log_batch(bi, losses, game)
            losses = []
def fitness(p1):
    # p1 is an agent controlling the left paddle; fitness is the number of
    # frames it survives, capped at 10000.
    res = 0
    cnt = 0
    run = True
    game = pong.PongGame()
    while run:
        cnt += 1
        move_num_left = p1.get_move(game.getState(-1))
        res = game.transition(move_num_left)
        run = (res == 0)
        if cnt > 10000:
            break
    return cnt
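# A hedged usage sketch (hypothetical, not in the original): rank a
# population of agents by this fitness, as a selection step might.
# "population" is any list of agents exposing get_move().
def rank_by_fitness(population):
    return sorted(population, key=fitness, reverse=True)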
def trainNetwork(inp, out, user_playing, use_model, filename):
    # Placeholders: one-hot chosen action and ground-truth Q-value.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])

    # Globally track how many iterations have been done.
    global_time = tf.Variable(0, name='global_time')

    # Optimization and cost functions (simple quadratic cost function).
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(action - gt))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Initialize game and replay queue.
    game = pong.PongGame(user_playing)
    replay = deque()

    # Convert the frame to grayscale, crop to 60x60, and binarize.
    frame = game.getCurrentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)

    # Input tensor: a stack of four identical frames to start.
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # Session, saver, and checkpoint management.
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=True))
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

    # If a filename is specified, try to load it; otherwise restore the
    # latest checkpoint.
    if filename is not None:
        checkpoint = './saves/' + filename
    else:
        checkpoint = tf.train.latest_checkpoint('./saves')

    if checkpoint is not None:
        saver.restore(sess, checkpoint)
    else:
        # No checkpoint found; initialize a new neural network.
        init = tf.global_variables_initializer()
        sess.run(init)

    t = global_time.eval()
    numIterations = 0
    epsilon = INITIAL_EPSILON

    while True:
        starttime = time.time()

        # Feed the input tensor into the network (batch of one).
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])

        # Epsilon-greedy: random action with probability epsilon, otherwise
        # the predicted action. "Stay" (action 0) is weighted heavily for
        # more realistic random play.
        if random.random() <= epsilon and not use_model:
            index = choice((0, 1, 2), p=(0.9, 0.05, 0.05))
        else:
            index = np.argmax(out_t)

        # Update epsilon.
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Mark the chosen action in the one-hot vector.
        argmax_t[index] = 1

        if use_model:
            mode = 'Model Only'
        elif numIterations > OBSERVE:
            mode = 'Exploring'
        else:
            mode = 'Observing'

        # Retrieve the next frame.
        reward_t, frame = game.getNextFrame(
            argmax_t, [t, np.max(out_t), epsilon, mode])
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))

        # New input tensor: the new frame plus the previous three.
        newinp_t = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Store the experience for later.
        replay.append((inp_t, argmax_t, reward_t, newinp_t))

        # If we run out of memory, pop the oldest experience.
        if len(replay) > REPLAY_MEMORY:
            replay.popleft()

        if numIterations > OBSERVE and not use_model:
            # Enough experiences stored; train on a random minibatch.
            training_batch = random.sample(replay, BATCH_SIZE)

            # Break it up into separate lists and build a batch of ground truths.
            inp_batch, argmax_batch, reward_batch, newinp_batch, gt_batch = \
                [], [], [], [], []
            for current in training_batch:
                inp_batch.append(current[0])
                argmax_batch.append(current[1])
                reward_batch.append(current[2])
                newinp_batch.append(current[3])

            # Predicted Q-values for the successor states.
            out_batch = out.eval(feed_dict={inp: newinp_batch})

            # Ground truth: gt = reward + gamma * Qmax (of next frame).
            for i in range(len(training_batch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # Feed the batches to the train_step defined earlier.
            train_step.run(feed_dict={
                argmax: argmax_batch,
                gt: gt_batch,
                inp: inp_batch
            })

        # Update the input tensor and increment t and numIterations.
        inp_t = newinp_t
        t += 1
        numIterations += 1

        # If it is time to save the state, do so.
        if t % SAVE_STEP == 0 and not use_model:
            sess.run(global_time.assign(t))
            saver.save(sess, './saves/model.ckpt', global_step=t)

        # When showing the model, throttle to the display frame rate.
        endtime = time.time()
        difference = endtime - starttime
        if use_model and difference < (1 / FPS):
            time.sleep((1 / FPS) - difference)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", index,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
def trainGraph(inp, out, sess):
    game = pong.PongGame()

    # To pick out the chosen action's value, we multiply the predicted
    # output with a one-hot vector.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth (discounted return)

    # Policy: softmax over the network output; "action" is the probability
    # assigned to the chosen action.
    prob = tf.nn.softmax(out)
    action = tf.reduce_sum(tf.multiply(prob, argmax), reduction_indices=1)

    # Cost function we will reduce through backpropagation.
    cost = tf.reduce_mean(tf.multiply(action, gt))  # not tf.square(action - gt)

    # Optimizer to minimize our cost function.
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Queues: Point buffers the frames of the current rally; D is the
    # experience-replay store.
    Point = deque()
    D = deque()

    # Initial frame: grayscale, resize, binarize, stack four copies.
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    t = 0
    epsilon = INITIAL_EPSILON
    win_count = 0  # initialized outside the loop so wins accumulate

    # Training time.
    while True:
        # Evaluate the policy and raw output for the current input.
        prob_eval, out_eval = sess.run([prob, out], feed_dict={inp: [inp_t]})
        prob_t = prob_eval[0]
        out_t = out_eval[0]

        # Epsilon-greedy action selection.
        argmax_t = np.zeros([ACTIONS])
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Reward is nonzero only when a point ends.
        reward_t, frame = game.getNextFrame(argmax_t)

        # Get frame pixel data.
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Buffer the experience; when the point ends, propagate the
        # discounted reward back through the whole rally.
        if reward_t == 0:
            Point.append([inp_t, argmax_t, reward_t, inp_t1])
            if len(Point) > 300:
                Point.popleft()
        else:
            reward_array = np.zeros(shape=[len(Point) + 1])
            # Fill the reward array back-to-front with discounted returns.
            for i in reversed(range(0, len(Point) + 1)):
                if i == len(Point):
                    Point.append([inp_t, argmax_t, reward_t, inp_t1])
                    reward_array[i] = reward_t
                else:
                    reward_array[i] = reward_array[i + 1] * GAMMA + reward_array[i]

            # Normalize the returns, write them back into the buffered
            # frames, and move the rally into the replay queue.
            reward_array -= np.mean(reward_array)
            reward_array /= np.std(reward_array)
            print("how long did the point last? ", len(Point))
            for i in range(len(reward_array)):
                QueueTransfer = Point.popleft()
                QueueTransfer[2] = reward_array[i]
                D.append(QueueTransfer)
            if reward_t == 1:
                win_count += 1

        # If we run out of replay memory, make room.
        if len(D) > REPLAY_MEMORY:
            for i in range(0, len(D) - REPLAY_MEMORY):
                D.popleft()

        # Training iteration.
        if len(D) == REPLAY_MEMORY:
            print("training")
            # Shuffle and drain the replay memory.
            np.random.shuffle(D)
            inp_batch = []
            argmax_batch = []
            reward_batch = []
            inp_t1_batch = []
            for i in range(len(D)):
                temp = D.pop()
                inp_batch.append(temp[0])
                argmax_batch.append(temp[1])
                reward_batch.append(temp[2])
                inp_t1_batch.append(temp[3])

            c1, a1, o1 = sess.run(
                [cost, action, out],
                feed_dict={
                    gt: [reward_batch[-1]],
                    argmax: [argmax_batch[-1]],
                    inp: [inp_batch[-1]]
                })

            # Train on the drained batch.
            train_step.run(feed_dict={
                gt: reward_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1

        # Save, then report where we are.
        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ ACTION", maxIndex, "/ EPSILON", epsilon,
              "/ REWARD", reward_t, "/ Wins", win_count,
              "/ Q_MAX %e" % np.max(out_t))
def main():
    # Initialize the game.
    game = pong.PongGame()

    # Get the initial frame.
    frame = game.getPresentFrame()
    # Convert RGB to grayscale for processing.
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    # Binary colors: black or white.
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # Stack four frames; that is our input tensor.
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # Create and initialize one graph/session per agent.
    g1 = tf.Graph()
    g2 = tf.Graph()
    with g1.as_default():
        inp_A, out_A = createGraph()
        saver_A = tf.train.Saver(tf.global_variables())
        sess_A = tf.Session(graph=g1)
        checkpoint_A = tf.train.latest_checkpoint('./checkpoints_A')
        saver_A.restore(sess_A, checkpoint_A)
    with g2.as_default():
        inp_B, out_B = createGraph()
        saver_B = tf.train.Saver(tf.global_variables())
        sess_B = tf.Session(graph=g2)
        checkpoint_B = tf.train.latest_checkpoint('./checkpoints_B')
        saver_B.restore(sess_B, checkpoint_B)

    # Keep track of timesteps.
    t = 0
    while True:
        # Output tensors for both agents.
        out_t_A = out_A.eval(session=sess_A, feed_dict={inp_A: [inp_t]})[0]
        out_t_B = out_B.eval(session=sess_B, feed_dict={inp_B: [inp_t]})[0]

        # Greedy action for each agent.
        argmax_t_A = np.zeros([ACTIONS])
        argmax_t_B = np.zeros([ACTIONS])
        maxIndex_A = np.argmax(out_t_A)
        maxIndex_B = np.argmax(out_t_B)
        argmax_t_A[maxIndex_A] = 1
        argmax_t_B[maxIndex_B] = 1

        # Step the game; it returns per-player scores and reward stats.
        score1, score2, cumScore1, cumScore2, \
            rewardID_player1, rewardID_player2, cumID1, cumID2, \
            rewardSE_player1, rewardSE_player2, cumSE1, cumSE2, \
            frame = game.getNextFrame(argmax_t_A, argmax_t_B)

        # Get frame pixel data.
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1

        # Save the stats log whenever a point is scored.
        if score1 == 1 or score2 == 1:
            with open('stats_test.txt', 'a') as log:
                scoreline = ('TIMESTEP ' + str(t)
                             + ' cumScore1 ' + str(cumScore1)
                             + ' cumScore2 ' + str(cumScore2)
                             + ' ID1 ' + str(rewardID_player1)
                             + ' ID2 ' + str(rewardID_player2)
                             + ' cumID1 ' + str(cumID1)
                             + ' cumID2 ' + str(cumID2)
                             + ' SE1 ' + str(rewardSE_player1)
                             + ' SE2 ' + str(rewardSE_player2)
                             + ' cumSE1 ' + str(cumSE1)
                             + ' cumSE2 ' + str(cumSE2) + '\n')
                log.write(scoreline)

        print("TIMESTEP", t, "/ EPSILON", "0")
def trainGraph(inp, out):
    # To pick out the chosen action's value, we multiply the predicted
    # output with a one-hot vector.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth
    global_step = tf.Variable(0, name='global_step')

    # Q-value of the chosen action.
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    # Cost function we will reduce through backpropagation.
    cost = tf.reduce_mean(tf.square(action - gt))
    # Optimization function to minimize our cost function.
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Initialize our game.
    game = pong.PongGame()

    # Create a queue for experience replay.
    D = deque()

    # Initial frame: grayscale, resize, binarize, stack four copies.
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # Saver; restore the latest checkpoint automatically if one exists.
    saver = tf.train.Saver(tf.global_variables())
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=True))
    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    if checkpoint is not None:
        print('Restore Checkpoint %s' % checkpoint)
        saver.restore(sess, checkpoint)
        print("Model restored.")
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Initialized new Graph")

    t = global_step.eval()
    c = 0
    epsilon = INITIAL_EPSILON

    # Training time.
    while True:
        # Output tensor.
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])

        if random.random() <= epsilon and not USE_MODEL:
            # Make 0 ("stay") the most chosen action for realistic randomness.
            maxIndex = choice((0, 1, 2), p=(0.90, 0.05, 0.05))
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        mode = 'observing'
        if t > OBSERVE:
            mode = 'training'
        if USE_MODEL:
            mode = 'model only'

        # Reward is nonzero only when the score changes.
        reward_t, frame = game.getNextFrame(
            argmax_t, [t, np.max(out_t), epsilon, mode])

        # Get frame pixel data.
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Add input tensor, action, reward, and next input tensor to the
        # stack of experiences.
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # If we run out of replay memory, make room.
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # Training iteration.
        if c > OBSERVE and not USE_MODEL:
            # Get values from our replay memory.
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            # Bellman targets for the batch.
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # Train on that.
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1
        c = c + 1

        # Save where we are, then report it.
        if t % SAVE_STEP == 0 and not USE_MODEL:
            sess.run(global_step.assign(t))
            saver.save(sess, './checkpoints/model.ckpt', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
def trainAgent(net):
    # Initialize our game.
    game = pong.PongGame()

    # Experience-replay queue; maxlen equals the replay-memory size, so old
    # entries are evicted automatically.
    D = deque(maxlen=REPLAY_MEMORY)

    # Initial frame: grayscale, resize, binarize, stack four copies.
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    Network = net().cuda()
    optimizer = optim.Adam(Network.parameters(), lr=1e-5)
    criterion = nn.MSELoss()
    writer = SummaryWriter(log_dir='./logs')

    if os.path.exists('./params.pkl'):
        print('Restore from existing model')
        Network.load_state_dict(torch.load('./params.pkl'))
        # TODO: record steps alongside the weights so epsilon resumes correctly.
        steps = 0
    else:
        steps = 0

    expected_epsilon = (INITIAL_EPSILON
                        - steps * (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE)
    if expected_epsilon > FINAL_EPSILON:
        epsilon = expected_epsilon
    else:
        epsilon = FINAL_EPSILON

    total_observe = steps + ADDITIONAL_OB

    # Training time.
    while True:
        out_t = Network(to_tensor(inp_t))

        # Epsilon-greedy action selection.
        argmax_t = np.zeros([ACTIONS])
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            _, maxIndex = torch.max(out_t, 1)
            maxIndex = maxIndex.cpu().numpy()[0]
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Reward is nonzero only when the score changes.
        reward_t, frame, hit_rate, hit_rate_100 = game.getNextFrame(argmax_t)

        # Get frame pixel data.
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Add the experience to the replay queue.
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # Training iteration.
        if steps > total_observe:
            # Sample a minibatch from replay memory.
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            out_prev_batch = Network(to_tensor(inp_batch))
            out_batch = Network(to_tensor(inp_t1_batch))

            # Q-value of each chosen action, and the Bellman target
            # gt = reward + gamma * max Q(next state).
            action = torch.sum(
                out_prev_batch.mul(torch.FloatTensor(argmax_batch).cuda()),
                dim=1)
            gt_batch = (torch.FloatTensor(reward_batch).cuda()
                        + GAMMA * out_batch.max(1)[0])
            gt_batch = torch.autograd.Variable(gt_batch, requires_grad=False)

            loss = criterion(action, gt_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        steps += 1

        # Record the agent's performance every 100 steps.
        if steps % 100 == 0:
            writer.add_scalars('', {'hit_rate': hit_rate,
                                    'hit_rate_100': hit_rate_100}, steps)

        # Save, then report where we are.
        if steps % 10000 == 0:
            torch.save(Network.state_dict(), './params.pkl')
            print("TIMESTEP", steps, "/ EPSILON %7.5f" % epsilon,
                  "/ ACTION", maxIndex, "/ REWARD", reward_t,
                  "/ Q_MAX %e" % torch.max(out_t))

        # Stop training after 1M steps.
        if steps > 1000000:
            break
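# to_tensor is not defined in this snippet. A plausible sketch (an
# assumption, not the original helper): convert one HWC frame stack, or a
# list of them, into a normalized NCHW float tensor on the GPU.
def to_tensor(x):
    x = np.asarray(x, dtype=np.float32) / 255.0
    if x.ndim == 3:                       # single (H, W, C) stack
        x = x[np.newaxis, ...]
    x = np.transpose(x, (0, 3, 1, 2))     # NHWC -> NCHW
    return torch.from_numpy(x).cuda()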
def trainGraph(inp, out):
    # Preparation stage: game, frames, saver, and checkpoint management.

    # To pick out the chosen action's value, we multiply the predicted
    # output with a one-hot vector.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth
    global_step = tf.Variable(0, name='global_step')

    # Q-value of the chosen action.
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    # Cost function which we will reduce through backpropagation.
    cost = tf.reduce_mean(tf.square(action - gt))
    # Optimization function to minimize our cost function.
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Initialize the game.
    game = pong.PongGame(0)

    # Experience-replay queue. A separate right-side queue (DR) is
    # deliberately disabled: the first goal is one agent that can beat the
    # opponent from both sides, so both sides share DL.
    DL = deque()

    # Get the initial frame and stack it four times as the input tensor.
    frame = game.getPresentFrame()
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # Saver and checkpoint management.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)

    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    if checkpoint is not None:
        print('Restore Checkpoint %s' % checkpoint)
        saver.restore(sess, checkpoint)
        print("Model restored.")
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Initialized new Graph")

    t = global_step.eval()
    c = 0
    epsilon = INITIAL_EPSILON
    train_side = 0

    # Training the DQN and exporting stats.
    while True:
        # Output tensor.
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])

        # Pick an action (epsilon-greedy).
        if random.random() <= epsilon and not USE_MODEL:
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Feed the action to whichever side is currently being trained.
        if train_side == 1:
            action_tuple = (None, argmax_t)
        else:
            action_tuple = (argmax_t, None)

        frame, rewards, done, cumScores = game.getNextFrame(action_tuple)
        reward_t = rewards[train_side]

        # Flip the image data so the agent always sees the left-side view.
        if train_side == 1:
            frame = frame[::-1, :]
            # cv2.imwrite("imgs/%i_train_side.png" % t, frame)

        frame = np.reshape(frame, (60, 60, 1))
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        DL.append((inp_t, argmax_t, reward_t, inp_t1))
        if len(DL) > REPLAY_MEMORY:
            DL.popleft()

        if c > OBSERVE and not USE_MODEL:
            minibatch = random.sample(DL, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            # Bellman targets for the batch.
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # Train on that.
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1
        c = c + 1

        # Save checkpoints.
        if t % SAVE_STEP == 0 and not USE_MODEL:
            sess.run(global_step.assign(t))
            saver.save(sess, './checkpoints/model.ckpt', global_step=t)

        # Save the stats log and switch sides when a game ends.
        if done:
            with open('stats_test.txt', 'a') as log:
                scoreline = ('TIMESTEP ' + str(t)
                             + ' cumScore1 ' + str(cumScores[train_side])
                             + ' cumScore2 ' + str(cumScores[1 - train_side])
                             + ' train side ' + str(train_side) + '\n')
                print(scoreline.strip())
                log.write(scoreline)
            train_side = 1 - train_side
def trainGraph(inp, out, sess):
    # To pick out the chosen action's value, we multiply the predicted
    # output with a one-hot vector.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth

    # Q-value of the chosen action.
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    # Cost function we will reduce through backpropagation.
    cost = tf.reduce_mean(tf.square(action - gt))
    # Optimization function to minimize our cost function.
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Initialize our game.
    game = pong.PongGame()

    # Point buffers the frames of the current rally; D stores whole rallies
    # for experience replay.
    Point = deque()
    D = deque()

    # Initial frame: grayscale, resize, binarize, stack four copies.
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    t = 0
    epsilon = INITIAL_EPSILON

    # Training time.
    while True:
        # Output tensor.
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]

        # Epsilon-greedy action selection.
        argmax_t = np.zeros([ACTIONS])
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Reward is nonzero only when the point ends.
        reward_t, frame = game.getNextFrame(argmax_t)

        # Get frame pixel data.
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Buffer frames until the point ends, then store the whole rally
        # (terminal frame first, then the buffered frames newest-first).
        if reward_t == 0:
            Point.append((inp_t, argmax_t, reward_t, inp_t1))
        else:
            framearray = np.empty(shape=(len(Point) + 1), dtype=object)
            for i in range(len(Point) + 1):
                if i == 0:
                    framearray[i] = (inp_t, argmax_t, reward_t, inp_t1)
                else:
                    framearray[i] = Point.pop()
            D.append(framearray)

        # If we run out of replay memory, make room.
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # Training iteration.
        if len(D) > 0:
            currentBATCH = min(BATCH, len(D))

            # Sample rallies from replay memory and flatten their frames.
            minibatch = random.sample(D, currentBATCH)
            inp_batch = []
            argmax_batch = []
            reward_batch = []
            inp_t1_batch = []
            for i in range(len(minibatch)):
                for j in range(len(minibatch[i])):
                    inp_batch.append(minibatch[i][j][0])
                    argmax_batch.append(minibatch[i][j][1])
                    reward_batch.append(minibatch[i][j][2])
                    inp_t1_batch.append(minibatch[i][j][3])

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            # Bellman targets; iterate over the flattened frames, not the
            # sampled rallies, so all batch lengths match.
            for i in range(len(inp_batch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # Train on that.
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1

        # Save, then report where we are.
        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
def trainGraph(inp, out, sess):
    # To pick out the chosen action's value, we multiply the predicted
    # output with a one-hot vector.
    argmax = tf.placeholder("float", [None, ACTIONS])
    gt = tf.placeholder("float", [None])  # ground truth

    # Q-value of the chosen action.
    action = tf.reduce_sum(tf.multiply(out, argmax), reduction_indices=1)
    # Cost function we will reduce through backpropagation.
    cost = tf.reduce_mean(tf.square(action - gt))
    # Optimization function to minimize our cost function.
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # Initialize our game.
    game = pong.PongGame()

    # Create a queue for experience replay.
    D = deque()

    # Initial frame (already preprocessed by the game); stack four copies
    # as the input tensor.
    frame = game.getPresentFrame()
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    t = 0
    epsilon = INITIAL_EPSILON

    # Training time.
    while True:
        # Output tensor.
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])

        # Random action with probability epsilon,
        # predicted action with probability (1 - epsilon).
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # Reward is nonzero only when the score changes.
        reward_t, frame = game.getNextFrame(argmax_t)
        frame = np.reshape(frame, (INPUT_SIZE, INPUT_SIZE, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Add input tensor, action, reward, and next input tensor to the
        # stack of experiences.
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # If we run out of replay memory, make room.
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # Training iteration.
        if t > OBSERVE:
            # Get values from our replay memory.
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            gt_batch = []
            out_batch = out.eval(feed_dict={inp: inp_t1_batch})

            # Bellman targets for the batch.
            for i in range(0, len(minibatch)):
                gt_batch.append(reward_batch[i] + GAMMA * np.max(out_batch[i]))

            # Train on that.
            train_step.run(feed_dict={
                gt: gt_batch,
                argmax: argmax_batch,
                inp: inp_batch
            })

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1

        # Save, then report where we are.
        if t % 10000 == 0:
            saver.save(sess, './' + 'pong' + '-dqn', global_step=t)

        print("TIMESTEP", t, "/ EPSILON", epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))
def trainGraph(inp, out):
    # Preparation stage: game, frames, saver, and checkpoint management.
    global_step = tf.Variable(0, name='global_step')

    # Initialize Pong.
    game = pong.PongGame()

    # Get the initial frame: grayscale, resize, binarize.
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)

    # Stack four frames to create the input tensor.
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    # Saver and checkpoint management.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        log_device_placement=True))
    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    if checkpoint is not None:
        print('Restore Checkpoint %s' % checkpoint)
        saver.restore(sess, checkpoint)
        print("Model restored.")
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Initialized new Graph")

    t = global_step.eval()

    # Running the DQN (greedy, no exploration) and exporting the stats.
    while True:
        # Output tensor and greedy action.
        out_t = out.eval(feed_dict={inp: [inp_t]})[0]
        argmax_t = np.zeros([ACTIONS])
        maxIndex = np.argmax(out_t)
        argmax_t[maxIndex] = 1

        # Step the game; it returns per-player scores and reward stats.
        score1, score2, cumScore1, cumScore2, \
            rewardID_player1, rewardID_player2, cumID1, cumID2, \
            rewardSE_player1, rewardSE_player2, cumSE1, cumSE2, \
            frame = game.getNextFrame(argmax_t)

        # Reward of the agent that we are testing.
        if REWARD == 'rewardID_player1':
            reward_t = rewardID_player1
        elif REWARD == 'rewardID_player2':
            reward_t = rewardID_player2
        elif REWARD == 'rewardSE_player1':
            reward_t = rewardSE_player1
        elif REWARD == 'rewardSE_player2':
            reward_t = rewardSE_player2

        # Get frame pixel data.
        frame = cv2.cvtColor(cv2.resize(frame, (60, 60)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (60, 60, 1))

        # New input tensor.
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # Update the input tensor to the next frame.
        inp_t = inp_t1
        t = t + 1

        # Save the stats log whenever a point is scored.
        if score1 == 1 or score2 == 1:
            with open('stats_test.txt', 'a') as log:
                scoreline = ('TIMESTEP ' + str(t)
                             + ' cumScore1 ' + str(cumScore1)
                             + ' cumScore2 ' + str(cumScore2)
                             + ' ID1 ' + str(rewardID_player1)
                             + ' ID2 ' + str(rewardID_player2)
                             + ' cumID1 ' + str(cumID1)
                             + ' cumID2 ' + str(cumID2)
                             + ' SE1 ' + str(rewardSE_player1)
                             + ' SE2 ' + str(rewardSE_player2)
                             + ' cumSE1 ' + str(cumSE1)
                             + ' cumSE2 ' + str(cumSE2) + '\n')
                log.write(scoreline)

        print("TIMESTEP", t, "/ EPSILON", "0", "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % np.max(out_t))