def playGame(sess, net):
    # open up a game state to communicate with emulator
    game_state = game.GameState()
    agent = Agent.Agent(sess)

    # get the first state by doing nothing and preprocess the image to 80x80x4
    x_t, r_0, terminal = game_state.frame_step([1, 0, 0])
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    aux_s = s_t
    score = 0

    while not terminal:
        # choose an action
        action = agent.choose_action_play(net, s_t)

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(action)
        score += r_t
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        aux_s = np.delete(s_t, 0, axis=2)
        s_t1 = np.append(aux_s, x_t1, axis=2)

        # update state
        s_t = s_t1

    # print final score
    print "FINAL SCORE", score
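# Agent.choose_action_play is not defined in this file. A minimal sketch of what
# it plausibly does, mirroring the greedy loop in playGame(sess) further down:
# evaluate the network's Q output for the current state and pick the argmax.
# The attribute names net.readout and net.s are assumptions, not from the source.
def choose_action_play(self, net, s_t):
    # Q-values for the current 4-frame state
    readout_t = self.sess.run(net.readout, feed_dict={net.s: [s_t]})[0]
    # greedy one-hot action vector
    action = np.zeros(len(readout_t))
    action[np.argmax(readout_t)] = 1
    return action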
def __init__(self, game_name):
    if game_name == "pong":
        # open up a game state to communicate with emulator
        import pong_fun as game
        self.game_name = "pong"
        self.game_state = game.GameState()
        self.action_number = 3
    elif game_name == "gym":
        self.game_name = "gym"
        #self.game_state = gym.make('FlappyBird-v0')
        self.game_state = gym.make('Breakout-v0')
        self.action_number = self.game_state.action_space.n
    elif game_name == "bird_black":
        # open up a game state to communicate with emulator
        import wrapped_flappy_bird as game
        self.game_name = "bird_black"
        self.game_state = game.GameState()
        self.action_number = 2
def playGame(sess):
    # open up a game state to communicate with emulator
    game_state = game.GameState()
    score = 0

    # get the first state by doing nothing and preprocess the image to 80x80x4
    x_t, r_0, terminal = game_state.frame_step([1, 0, 0])
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    aux_s = s_t
    t = 0

    while not terminal:
        # choose an action greedily
        readout_t = O_readout.eval(session=sess, feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = np.argmax(readout_t)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        aux_s = np.delete(s_t, 0, axis=2)
        s_t1 = np.append(aux_s, x_t1, axis=2)

        # update state and score
        s_t = s_t1
        t += 1
        score += r_t

        print "TIMESTEP", t, "/ ACTION", action_index, "/ REWARD", r_t
        print readout_t

    # print final score
    print "FINAL SCORE", score
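# The resize / grayscale / frame-stack boilerplate above recurs in every
# function in this file. A small helper pair, written as a sketch (the 80x80
# size and BGR input follow the code above; cv2 and np are the module imports):
def preprocess_frame(frame, size=(80, 80)):
    """Convert an emulator frame to an 80x80 grayscale image."""
    return cv2.cvtColor(cv2.resize(frame, size), cv2.COLOR_BGR2GRAY)

def push_frame(s_t, x_t1):
    """Drop the oldest of the 4 stacked frames and append the newest last.

    Note the two conventions in this file: the playGame functions append the
    newest frame last (np.append(aux_s, x_t1, axis=2)), while the trainNetwork
    variants below put it first (np.append(x_t1, s_t[:, :, 0:3], axis=2)).
    Either works as long as training and play agree on the order.
    """
    return np.append(np.delete(s_t, 0, axis=2),
                     np.reshape(x_t1, (80, 80, 1)), axis=2)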
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    #a_file = open("logs_" + GAME + "/readout.txt", 'w')
    #h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # live plot of the training cost
    xdata, reworddata = [], []
    plt.show()
    axes = plt.gca()
    axes.set_xlim(0, 10000)
    axes.set_ylim(-0.5, 0.5)
    line, = axes.plot(xdata, reworddata, 'r-')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    print('x siz', x_t.min())
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    #saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    #checkpoint = tf.train.get_checkpoint_state("saved_networks")
    #if checkpoint and checkpoint.model_checkpoint_path:
    #    saver.restore(sess, checkpoint.model_checkpoint_path)
    #    print("Successfully loaded:", checkpoint.model_checkpoint_path)
    #else:
    #    print("Could not find old network weights")

    epsilon = INITIAL_EPSILON
    t = 0
    while "pigs" != "fly":  # loop forever
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
                #print('maxreadout is', np.max(readout_j1_batch[i]), 'y_batch is', (y_batch[i]), 'r_batch is', (r_batch[i]))
                #print('y_batch is', (y_batch[i]))
                #print('y_batch is', (r_batch[i]))

            # perform gradient step
            train_stepe, coste, readout_actione, h_fc11 = sess.run(
                [train_step, cost, readout_action, h_fc1],
                feed_dict={y: y_batch, a: a_batch, s: s_j_batch})
            #print('cost is', coste, 'readout_actione is', readout_actione, 'NetworkWeight', np.asarray(h_fc11[:,:,0,0]))
            #xdata.append(t)
            #reworddata.append(coste)
            #line.set_xdata(xdata)
            #line.set_ydata(reworddata)
            #plt.draw()
            #plt.pause(1e-200)

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        #if t % 10000 == 0:
        #    saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
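# The module-level constants used above (ACTIONS, GAMMA, OBSERVE, EXPLORE, the
# epsilons, REPLAY_MEMORY, BATCH, K) are defined outside these functions. A
# sketch with illustrative values -- the exact numbers vary per game and are an
# assumption here, not taken from the source:
GAME = 'pong'            # name used for log and checkpoint files
ACTIONS = 3              # number of valid actions
GAMMA = 0.99             # decay rate of past observations
OBSERVE = 10000.         # timesteps to fill the replay memory before training
EXPLORE = 500000.        # frames over which to anneal epsilon
INITIAL_EPSILON = 1.0    # starting value of epsilon
FINAL_EPSILON = 0.05     # final value of epsilon
REPLAY_MEMORY = 50000    # number of previous transitions to remember
BATCH = 32               # size of minibatch
K = 1                    # repeat each selected action for K frames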
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    #saver.restore(sess, "/tmp/model.ckpt")

    epsilon = INITIAL_EPSILON

    # observe phase: act randomly to fill the replay memory
    for t in range(int(OBSERVE)):
        # choose an action at random (readout_t is evaluated only for logging)
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = random.randrange(ACTIONS)
        a_t[action_index] = 1

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # update the old values
        s_t = s_t1

        # print info
        state = "observe"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))

    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # sample a minibatch to train on
        minibatch = random.sample(D, BATCH)

        # get the batch variables
        s_j_batch = [d[0] for d in minibatch]
        a_batch = [d[1] for d in minibatch]
        r_batch = [d[2] for d in minibatch]
        s_j1_batch = [d[3] for d in minibatch]

        y_batch = []
        readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
        for i in range(0, len(minibatch)):
            # if terminal, the target only equals the reward
            if minibatch[i][4]:
                y_batch.append(r_batch[i])
            else:
                y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))  # todo: ? not reward

        # perform gradient step
        train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, "/save/model-" + str(t) + ".ckpt")

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        #print("TIMESTEP", t, "/ STATE", state, "/ LINES", game_state.total_lines, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t))
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
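# A sketch of how these trainNetwork variants are typically driven. The
# createNetwork() factory (returning the input placeholder, Q-value readout,
# and last hidden layer) is assumed here -- its definition is not part of
# this file:
def playGame():
    sess = tf.InteractiveSession()
    s, readout, h_fc1 = createNetwork()   # assumed factory, defined elsewhere
    trainNetwork(s, readout, h_fc1, sess)

def main():
    playGame()

if __name__ == "__main__":
    main()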
def trainNetwork(s, readout, h_fc1, sess):
    tick = time.time()

    # define the cost function
    a = tf.placeholder("float", [None, actions])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    game_state = game.GameState()

    # score differences of the past 4 rallies
    win_score = [0, 0, 0, 0]

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(actions)
    do_nothing[0] = 1
    x_t, r_0, terminal, bar1_score, bar2_score = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())

    observe = 500.
    explore = 500.
    FINAL_EPSILON = 0.05
    INITIAL_EPSILON = 1.0
    epsilon = INITIAL_EPSILON
    t = 0
    K = 1
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([actions])
        action_index = 0
        if random.random() <= epsilon or t <= observe:
            action_index = random.randrange(actions)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > observe:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / explore

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal, bar1_score, bar2_score = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > replay_memory:
                D.popleft()

        # only train if done observing
        if t > observe:
            # sample a minibatch to train on
            minibatch = random.sample(D, batch)
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + gamma * np.max(readout_j1_batch[i]))

            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        s_t = s_t1
        t += 1

        if r_t != 0:
            print("Timestep", t, " Score", bar1_score)
            # keep a sliding window of the last 4 score differences
            win_score.pop(0)
            win_score.append(bar1_score - bar2_score)
            # stop once the agent is consistently far ahead
            if np.matrix(win_score).sum() > 72:  # 72
                print("Game_Ends_in Time:", int(time.time() - tick))
                break
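# Design note: the 4-element sliding window above can be expressed more directly
# with a bounded deque. A small illustrative alternative, not code from the
# source:
from collections import deque

win_score = deque([0, 0, 0, 0], maxlen=4)
win_score.append(3)        # the oldest entry is discarded automatically
if sum(win_score) > 72:    # same termination test as above
    pass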
def trainNetwork(s, readout_net1, readout_net2, readout_netb1, readout_netb2, sess):
    # define the cost function for each of the four networks
    [train_step_net1, y_net1, a_net1] = getTrainStep(readout_net1)
    [train_step_net2, y_net2, a_net2] = getTrainStep(readout_net2)
    [train_step_netb1, y_netb1, a_netb1] = getTrainStep(readout_netb1)
    [train_step_netb2, y_netb2, a_netb2] = getTrainStep(readout_netb2)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + args.game_log_name + "/readout.txt", 'w')
    h_file = open("logs_" + args.game_log_name + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(args.action_count)
    do_nothing[0] = 1
    x_t, r_0, r_1, terminal = game_state.frame_step(do_nothing, do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    if args.figure:
        fig = plt.figure()
        plt.imshow(x_t.T)
        fig.savefig(args.figure_name)
        import time
        #time.sleep(5)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        print checkpoint.model_checkpoint_path
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print "Successfully loaded:", checkpoint.model_checkpoint_path
    else:
        print "Could not find old network weights"

    epsilon = args.initial_epsilon
    t = 0
    net_flag = 0
    cnt = 0
    while True:
        # choose an action epsilon greedily for each player
        if net_flag == 0:
            readout_t = readout_net1.eval(feed_dict={s: [s_t]})[0]
            readout_bt = readout_netb1.eval(feed_dict={s: [s_t]})[0]
        else:
            readout_t = readout_net2.eval(feed_dict={s: [s_t]})[0]
            readout_bt = readout_netb2.eval(feed_dict={s: [s_t]})[0]

        a_t = np.zeros([args.action_count])
        a_bt = np.zeros([args.action_count])
        action_index = 0
        if random.random() <= epsilon or t <= args.observation_count:
            action_index = random.randrange(args.action_count)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        action_indexb = 0
        if random.random() <= epsilon or t <= args.observation_count:
            action_indexb = random.randrange(args.action_count)
            a_bt[action_indexb] = 1
        else:
            action_indexb = np.argmax(readout_bt)
            a_bt[action_indexb] = 1

        # scale down epsilon
        if epsilon > args.final_epsilon and t > args.observation_count:
            epsilon -= (args.initial_epsilon - args.final_epsilon) / args.explore_frames

        for i in range(0, args.k):
            # run the selected actions and observe next state and rewards
            x_t1_col, r_t, r_bt, terminal = game_state.frame_step(a_t, a_bt)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(s_t[:, :, 1:], x_t1, axis=2)

            # debugging figures (disabled)
            '''
            if t == 5:
                fig1 = plt.figure()
                plt.imshow(s_t[:,:,0].T)
                fig1.savefig('trrr_1.png')
                fig2 = plt.figure()
                plt.imshow(s_t[:,:,1].T)
                fig2.savefig('trrr_2.png')
                fig3 = plt.figure()
                plt.imshow(s_t[:,:,2].T)
                fig3.savefig('trrr_3.png')
                fig4 = plt.figure()
                plt.imshow(s_t[:,:,3].T)
                fig4.savefig('trrr_4.png')
                time.sleep(5)
            '''

            # store the transition in D
            D.append((s_t, a_t, r_t, a_bt, r_bt, s_t1, terminal))
            if len(D) > args.replay_memory:
                D.popleft()

        # only train if done observing
        if t > args.observation_count:
            cnt = cnt + 1
            # sample a minibatch to train on (training block disabled)
            '''
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            ab_batch = [d[3] for d in minibatch]
            rb_batch = [d[4] for d in minibatch]
            s_j1_batch = [d[5] for d in minibatch]

            y_batch = []
            yb_batch = []
            if net_flag == 0:
                readout_j1_batch = readout_net2.eval(feed_dict={s: s_j1_batch})
                readoutb_j1_batch = readout_netb2.eval(feed_dict={s: s_j1_batch})
            else:
                readout_j1_batch = readout_net1.eval(feed_dict={s: s_j1_batch})
                readoutb_j1_batch = readout_netb1.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][6]:
                    y_batch.append(r_batch[i])
                    yb_batch.append(rb_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))
                    yb_batch.append(rb_batch[i] + GAMMA * np.max(readoutb_j1_batch[i]))

            # perform gradient step
            if net_flag == 0:
                train_step_net2.run(feed_dict={y_net2: y_batch, a_net2: a_batch, s: s_j_batch})
                train_step_netb2.run(feed_dict={y_netb2: yb_batch, a_netb2: ab_batch, s: s_j_batch})
            else:
                train_step_net1.run(feed_dict={y_net1: y_batch, a_net1: a_batch, s: s_j_batch})
                train_step_netb1.run(feed_dict={y_netb1: yb_batch, a_netb1: ab_batch, s: s_j_batch})
            '''

        # switch the active network pair periodically
        if cnt % args.switch_net == 0:
            if net_flag == 0:
                net_flag = 1
            else:
                net_flag = 0
            #print 'SwitchState'

        # update the old values
        s_t = s_t1
        t += 1

        # print info
        state = ""
        if t <= args.observation_count:
            state = "observe"
        elif t > args.observation_count and t <= args.observation_count + args.explore_frames:
            state = "explore"
        else:
            state = "train"

        #if r_t != 0:
        #    print "TIMESTEP", t, "/ STATE", state, "/ LINES", game_state.total_lines, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)
        if r_bt != 0:
            print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_indexb, "/ REWARD", r_bt, "/ Q_MAX %e" % np.max(readout_bt)
        if r_t != 0:
            print "TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t)

        # write info to files
        '''
        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s: [s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
        '''

        if t and t % args.terminate_prompt == 0:
            print "TIMESTEP = " + str(t)
            if yes_or_no("Want to terminate the code"):
                return
            else:
                continue
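# getTrainStep is called above but not defined in this file. A minimal sketch,
# mirroring the cost function the other trainNetwork variants build inline
# (ACTIONS stands in for args.action_count):
def getTrainStep(readout):
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    # Q-value of the chosen action, squared TD-error cost, Adam update
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)
    return [train_step, y, a]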
def trainNetwork(s, readout, h_fc1, sess, merged, writer):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    #readout_action = tf.reduce_sum(tf.mul(readout, a), reduction_indices=1)
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open(out_put_path + "/readout.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state(store_network_path)
    #saver.restore(sess, "new_networks/pong-dqn-" + str(pretrain_number))
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        #saver.restore(sess, "my_networks/pong-dqn-26000")
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
        print("Press any key and Enter to continue:")
        #raw_input()

    epsilon = INITIAL_EPSILON
    t = 0
    total_score = 0
    positive_score = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)
            total_score = total_score + r_t

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()
            if r_t == 1:
                positive_score = positive_score + r_t

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, store_network_path + GAME + '-dqn',
                       global_step=t + pretrain_number)
            #saver.save(sess, 'new_networks/' + GAME + '-dqn', global_step=t)

        # write a summary and a log line every 500 iterations
        if t % 500 == 0:
            now = datetime.datetime.now()
            diff_seconds = (now - start).seconds
            time_text = sencond2time(diff_seconds)
            result = sess.run(merged, feed_dict={s: [s_t]})
            writer.add_summary(result, t + pretrain_number)
            a_file.write(str(t + pretrain_number) + ',' +
                         ",".join([str(x) for x in readout_t]) +
                         ',' + str(total_score) + ',' + str(positive_score) +
                         ',' + time_text + '\n')

        # print info
        print("TIMESTEP:", t + pretrain_number, "/ ACTION:", action_index,
              "/ REWARD:", r_t, "/ Q_MAX: %e" % np.max(readout_t),
              ' time:(H,M,S):' + sencond2time((datetime.datetime.now() - start).seconds))
        print('Total score:', total_score, ' Positive_score:', positive_score,
              ' up:', readout_t[0], ' down:', readout_t[1], ' no:', readout_t[2])
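# sencond2time (sic) is used above for the elapsed-time log column but is not
# defined in this file. A sketch consistent with its usage -- formatting a
# number of seconds as hours, minutes, seconds:
def sencond2time(seconds):
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    return "%d:%02d:%02d" % (h, m, s)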
def trainNetwork(model, args):
    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal, _ = game_state.frame_step(do_nothing)

    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (80, 80))
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=0)
    # in Keras, need to reshape
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    learning_mode = 0  # 2 for learning based on human, 3 for reverse reinforcement

    if args['mode'] == 'Run':
        OBSERVE = 999999999  # we keep observing, never train
        epsilon = FINAL_EPSILON
        print("Now we load weight")
        model.load_weights("model1.h5")
        adam = Adam(lr=1e-6)
        model.compile(loss='mse', optimizer=adam)
        print("Weight load successfully")
        training_mode = False  # running
        os.mkdir("pic", 0755)
        a_file = open("logs_" + GAME + "/logfile_" + str(counter) + ".txt", 'a')
    else:
        # we go to training mode
        OBSERVE = OBSERVATION
        epsilon = INITIAL_EPSILON
        learning_mode = int(args['learning_mode'])
        if os.path.isfile("model.h5"):  # check if the weights file exists
            model.load_weights("model.h5")
            adam = Adam(lr=1e-6)
            model.compile(loss='mse', optimizer=adam)
            print("Weight load successfully")
        # printing log file
        training_mode = True  # training
        j = 1
        os.mkdir("pic/" + str(j), 0755)

    t = 0
    pic_counter = 0
    while True:
        loss = 0
        Q_sa = 0
        action_index = 0
        r_t = 0
        a_t = np.zeros([ACTIONS])

        # choose an action epsilon greedily
        if t % FRAME_PER_ACTION == 0:
            if not training_mode:  # running
                q = model.predict(s_t)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index = max_Q
                a_t[action_index] = 1

        # we reduce epsilon gradually
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal, score = game_state.frame_step(a_t)
        game_over = terminal

        x_t1 = skimage.color.rgb2gray(x_t1_colored)
        if score >= 0:
            fig1 = plt.figure(pic_counter)
            plt.imshow(x_t1_colored)
            print('time now: ', datetime.datetime.now())
            fig1.savefig('pic/' + str(j) + '/' + str(pic_counter) + 'colored pic.png')
            plt.close()
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))

        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        s_t1 = np.append(x_t1, s_t[:, :3, :, :], axis=1)

        # store the transition in D
        D.append((s_t, action_index, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        s_t = s_t1
        t = t + 1
        pic_counter += 1

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        if game_over:
            j = j + 1
            os.mkdir("pic/" + str(j), 0755)
            print(j, "score: ", score, file=a_file)
            a_file.flush()
            pic_counter = 0
            print("Episode finished!")
            print("************************")
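# The Keras `model` passed in above is built elsewhere. A minimal sketch of a
# compatible architecture (4x80x80 input, one Q-value per action) in the
# Keras 1 style this code targets -- the layer sizes are illustrative
# assumptions, not taken from the source:
from keras.models import Sequential
from keras.layers import Convolution2D, Activation, Flatten, Dense
from keras.optimizers import Adam

def buildmodel():
    model = Sequential()
    model.add(Convolution2D(32, 8, 8, subsample=(4, 4), border_mode='same',
                            input_shape=(4, 80, 80)))  # stack of 4 frames
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 4, 4, subsample=(2, 2), border_mode='same'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(ACTIONS))  # one Q-value per action
    model.compile(loss='mse', optimizer=Adam(lr=1e-6))
    return model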
def trainNetwork(model1, model2, args):
    player1_wins_in_a_row = 0
    player2_wins_in_a_row = 0
    player1_num_of_trains = 0
    player2_num_of_trains = 0

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D1 = deque()
    D2 = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t1, r_0, terminal, _, _ = game_state.frame_step(do_nothing, do_nothing)

    x_t1 = skimage.color.rgb2gray(x_t1)
    x_t1 = skimage.transform.resize(x_t1, (80, 80))
    x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
    x_t2 = np.flipud(x_t1)  # player 2 sees the board flipped

    player1_curr_state = np.stack((x_t1, x_t1, x_t1, x_t1), axis=0)
    player1_curr_state = player1_curr_state.reshape(
        1, player1_curr_state.shape[0], player1_curr_state.shape[1],
        player1_curr_state.shape[2])
    player2_curr_state = np.stack((x_t2, x_t2, x_t2, x_t2), axis=0)
    player2_curr_state = player2_curr_state.reshape(
        1, player2_curr_state.shape[0], player2_curr_state.shape[1],
        player2_curr_state.shape[2])

    # training mode
    OBSERVE = OBSERVATION
    epsilon = INITIAL_EPSILON

    # moving old trials to the old_trials folder
    if os.path.exists("trials_simultaneously"):
        copytree("trials_simultaneously", "old_trials_simultaneously")
        shutil.rmtree("trials_simultaneously")
    os.mkdir("trials_simultaneously", 0755)

    learning_mode = int(args['learning_mode'])  # which player learns
    if os.path.isfile("model1.h5"):
        model1.load_weights("model1.h5")
    if os.path.isfile("model2.h5"):
        model2.load_weights("model2.h5")
    adam = Adam(lr=1e-6)
    model1.compile(loss='mse', optimizer=adam)
    model2.compile(loss='mse', optimizer=adam)
    print("Weights loaded successfully")
    training_mode = True  # training

    observation_counter = 0
    num_folder = 0
    start_time = datetime.datetime.now()

    while True:
        loss1 = 0
        Q_sa1 = 0
        action_index1 = 0
        r_t1 = 0
        loss2 = 0
        Q_sa2 = 0
        action_index2 = 0
        r_t2 = 0
        a_t1 = np.zeros([ACTIONS])
        a_t2 = np.zeros([ACTIONS])

        # choose an action epsilon greedily
        if (observation_counter % FRAME_PER_ACTION) == 0:
            if random.random() <= epsilon:  # for player 1
                action_index1 = random.randrange(ACTIONS)
                a_t1[action_index1] = 1
            else:
                q = model1.predict(player1_curr_state)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index1 = max_Q
                a_t1[action_index1] = 1
            if random.random() <= epsilon:  # for player 2
                action_index2 = random.randrange(ACTIONS)
                a_t2[action_index2] = 1
            else:
                q = model2.predict(player2_curr_state)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index2 = max_Q
                a_t2[action_index2] = 1

        # we reduce epsilon gradually
        if (epsilon > FINAL_EPSILON) and (observation_counter > OBSERVE):
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected actions and observe next state and reward
        x_t1_colored, r_t1, terminal, score, _ = game_state.frame_step(a_t1, a_t2)
        r_t2 = r_t1 * (-1)  # zero-sum game: player 2's reward mirrors player 1's
        game_over = terminal

        x_t1_grey = skimage.color.rgb2gray(x_t1_colored)
        thresh = threshold_otsu(x_t1_grey)
        x_t1 = x_t1_grey > thresh  # binary image
        x_t2 = np.flipud(x_t1)
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
        x_t2 = skimage.transform.resize(x_t2, (80, 80))
        x_t2 = skimage.exposure.rescale_intensity(x_t2, out_range=(0, 255))

        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        x_t2 = x_t2.reshape(1, 1, x_t2.shape[0], x_t2.shape[1])
        player1_next_state = np.append(x_t1, player1_curr_state[:, :3, :, :], axis=1)
        player2_next_state = np.append(x_t2, player2_curr_state[:, :3, :, :], axis=1)

        # store the transitions in D1 and D2
        D1.append((player1_curr_state, action_index1, r_t1, player1_next_state, terminal))
        if len(D1) > REPLAY_MEMORY:
            D1.popleft()
        D2.append((player2_curr_state, action_index2, r_t2, player2_next_state, terminal))
        if len(D2) > REPLAY_MEMORY:
            D2.popleft()

        if observation_counter > OBSERVE:
            # sample a minibatch to train on
            minibatch1 = random.sample(D1, BATCH)
            minibatch2 = random.sample(D2, BATCH)

            inputs1 = np.zeros((BATCH, IMAGE_DEPTH, IMAGE_WIDTH, IMAGE_HEIGHT))  # 32, 4, 80, 80
            targets1 = np.zeros((BATCH, ACTIONS))  # 32, 2
            inputs2 = np.zeros((BATCH, IMAGE_DEPTH, IMAGE_WIDTH, IMAGE_HEIGHT))  # 32, 4, 80, 80
            targets2 = np.zeros((BATCH, ACTIONS))  # 32, 2

            # now we do the experience replay for player 1
            for i in range(0, len(minibatch1)):
                curr_state_t1 = minibatch1[i][0]
                action_t1 = minibatch1[i][1]  # this is the action index
                reward_t1 = minibatch1[i][2]
                next_state_t1 = minibatch1[i][3]
                terminal1 = minibatch1[i][4]
                inputs1[i:i + 1] = curr_state_t1
                targets1[i] = model1.predict(curr_state_t1)  # predicted Q-value per action
                Q_sa1 = model1.predict(next_state_t1)  # bootstrap the target from the next state
                # if terminated, the target only equals the reward
                if terminal1:
                    targets1[i, action_t1] = reward_t1
                else:
                    targets1[i, action_t1] = reward_t1 + GAMMA * np.max(Q_sa1)
            loss1 += model1.train_on_batch(inputs1, targets1)

            # now we do the experience replay for player 2
            for i in range(0, len(minibatch2)):
                curr_state_t2 = minibatch2[i][0]
                action_t2 = minibatch2[i][1]  # this is the action index
                reward_t2 = minibatch2[i][2]
                next_state_t2 = minibatch2[i][3]
                terminal2 = minibatch2[i][4]
                inputs2[i:i + 1] = curr_state_t2
                targets2[i] = model2.predict(curr_state_t2)  # predicted Q-value per action
                Q_sa2 = model2.predict(next_state_t2)  # bootstrap the target from the next state
                # if terminated, the target only equals the reward
                if terminal2:
                    targets2[i, action_t2] = reward_t2
                else:
                    targets2[i, action_t2] = reward_t2 + GAMMA * np.max(Q_sa2)
            loss2 += model2.train_on_batch(inputs2, targets2)

        player1_curr_state = player1_next_state
        player2_curr_state = player2_next_state
        observation_counter = observation_counter + 1

        # save progress every 100 iterations
        if observation_counter % 100 == 0:
            #print("Now we save model")
            if learning_mode == 1:
                model1.save_weights("model1.h5", overwrite=True)
                with open("model1.json", "w") as outfile1:
                    json.dump(model1.to_json(), outfile1)
            elif learning_mode == 2:
                model2.save_weights("model2.h5", overwrite=True)
                with open("model2.json", "w") as outfile2:
                    json.dump(model2.to_json(), outfile2)

        current_time = datetime.datetime.now()
        elapsedTime = (current_time - start_time).total_seconds()
        if elapsedTime >= 30 * 60:
            num_folder += 1
            start_time = datetime.datetime.now()
            os.makedirs("trials_simultaneously/" + "player" + str(1) + "learning" +
                        "/" + str(num_folder), 0755)
            shutil.copy2('model1.h5',
                         "trials_simultaneously/" + "player" + str(1) + "learning" +
                         "/" + str(num_folder) + '/model1.h5')
            os.makedirs("trials_simultaneously/" + "player" + str(2) + "learning" +
                        "/" + str(num_folder), 0755)
            shutil.copy2('model2.h5',
                         "trials_simultaneously/" + "player" + str(2) + "learning" +
                         "/" + str(num_folder) + '/model2.h5')

        if game_over:
            if score[0] < score[1]:
                player2_wins_in_a_row = player2_wins_in_a_row + 1
                player1_wins_in_a_row = 0
                percentage = 0.0
            elif score[1] < score[0]:
                player1_wins_in_a_row = player1_wins_in_a_row + 1
                player2_wins_in_a_row = 0
                percentage = 1.0
            else:
                percentage = score[0] / float(score[0] + score[1])
            print("Episode finished!")
            print("************************")
def trainNetwork(s, coeff, readout, sess):
    tick = time.time()

    # define the cost function
    a = tf.placeholder("float", [None, actions])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    replay_memory = 100000
    D = deque()

    # get the first state by doing nothing and preprocess the image to 84x84x4
    do_nothing = np.zeros(actions)
    do_nothing[0] = 1
    x_t, r_0, terminal, bar1_score, bar2_score = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    #sess.run(tf.initialize_all_variables())
    sess.run(tf.global_variables_initializer())

    # routing coefficients for the capsule layer
    b_IJ1 = np.zeros((1, 1152, 10, 1, 1)).astype(np.float32)      # batch_size = 1
    b_IJ2 = np.zeros((batch, 1152, 10, 1, 1)).astype(np.float32)  # batch_size = BATCH

    FINAL_EPSILON = 0.05
    INITIAL_EPSILON = 1.0
    epsilon = INITIAL_EPSILON
    t = 0
    episode = 0
    OBSERVE = 1000
    EXPLORE = 5000
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: s_t.reshape((1, 84, 84, 4)), coeff: b_IJ1})
        a_t = np.zeros([actions])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(actions)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        K = 1
        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal, bar1_score, bar2_score = game_state.frame_step(a_t)
            if terminal == 1:
                episode += 1
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (84, 84)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (84, 84, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > replay_memory:
                D.popleft()

        # only train if done observing, every train_freq steps
        if t > OBSERVE and t % train_freq == 0:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch, coeff: b_IJ2})
            for i in range(0, len(minibatch)):
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + gamma * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch, coeff: b_IJ2})

        s_t = s_t1
        t += 1

        if r_t != 0:
            print("Timestep", t, "/ Score", bar1_score)
        if bar1_score - bar2_score > 17:
            print("Game_Ends_in Time:", int(time.time() - tick))
            break
        if bar1_score - bar2_score > 15:
            print("Game_Mid_in Time:", int(time.time() - tick))
def play_game(left_player):
    # open up a game state to communicate with emulator
    game_state = game.GameState()
    time.sleep(2)

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)  # prevents an error in frame_step; both bars stay in place
    do_nothing[0] = 1
    game_image_data, _, r_0, terminal, _, _ = game_state.frame_step(do_nothing, do_nothing)

    # image processing
    game_image_data = skimage.color.rgb2gray(game_image_data)
    game_image_data = skimage.transform.resize(game_image_data, (80, 80))
    game_image_data = skimage.exposure.rescale_intensity(game_image_data, out_range=(0, 255))
    for i in range(80):  # erasing the line on the right side of the screen
        game_image_data[79, i] = 0

    # initiating the first 4 frames to the same frame
    last_4_frames = np.stack(
        (game_image_data, game_image_data, game_image_data, game_image_data), axis=0)
    # in Keras, need to reshape
    last_4_frames = last_4_frames.reshape(1, last_4_frames.shape[0],
                                          last_4_frames.shape[1], last_4_frames.shape[2])

    number_of_games = 3
    while number_of_games > 0:
        actions_vector1 = np.zeros([ACTIONS])
        actions_vector2 = np.zeros([ACTIONS])

        # choose an action for the left player:
        q1 = left_player.model.predict(last_4_frames)  # input a stack of 4 images, get the prediction
        max_Q1 = np.argmax(q1)
        action_index1 = max_Q1
        actions_vector1[action_index1] = 1

        # action for the right player: keyboard input from a human
        #events = pygame.event.get()
        #for event in events:
        #    if event.type == pygame.KEYDOWN:
        #        if event.key == pygame.K_UP:
        #            #action_index2 = 1
        #            actions_vector2[1] = 1
        #        if event.key == pygame.K_DOWN:
        #            #action_index2 = 2
        #            actions_vector2[2] = 1
        #        if event.key == pygame.K_ESCAPE:
        #            exit()
        keys = pygame.key.get_pressed()  # checking pressed keys
        if keys[pygame.K_UP]:
            actions_vector2[1] = 1
        if keys[pygame.K_DOWN]:
            actions_vector2[2] = 1
        #actions_vector2[action_index2] = 1

        # in order for us to see the game
        image_data_colored1, _, _, terminal, score, no_learning_time = \
            game_state.frame_step(actions_vector1, actions_vector2)
        game_over = terminal
        if game_over:
            number_of_games -= 1  # count the finished game so the loop terminates
            time.sleep(5)

        # image processing
        x_t1 = skimage.color.rgb2gray(image_data_colored1)
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
        for i in range(80):  # erasing the line on the right side of the screen
            x_t1[79, i] = 0
        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        last_4_frames1 = np.append(x_t1, last_4_frames[:, :3, :, :], axis=1)
        last_4_frames = last_4_frames1

    print('game over!\n')
def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    ## loss function
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)  # use as Q?
    cost = tf.reduce_mean(tf.square(y - readout_action))
    ## if PSO or GA were used instead, this is where they would minimize the cost and update the weights
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    ##a_file = open("logs_" + GAME + "/readout.txt", 'w')
    ##h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()  # create a saver
    sess.run(tf.initialize_all_variables())
    ##checkpoint = tf.train.get_checkpoint_state("saved_networks")
    path_c = "C:/Users/marco/Desktop/Documentos/DeepReinforcedLearning/DeepLearningVideoGames-master/saved_networks"
    # correction for loading the TF session
    checkpoint = tf.train.latest_checkpoint(path_c, latest_filename="checkpoint")
    if checkpoint:
        saver.restore(sess, checkpoint)
        print("Successfully loaded:")
        print(checkpoint)
    else:
        print("Could not find old network weights")

    epsilon = INITIAL_EPSILON
    t = 0
    while t < 100000:  # originally: while "pigs" != "fly"
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        for i in range(0, K):
            # run the selected action and observe next state and reward
            x_t1_col, r_t, terminal = game_state.frame_step(a_t)
            x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
            ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
            x_t1 = np.reshape(x_t1, (80, 80, 1))
            s_t1 = np.append(x_t1, s_t[:, :, 0:3], axis=2)

            # store the transition in D
            D.append((s_t, a_t, r_t, s_t1, terminal))
            if len(D) > REPLAY_MEMORY:
                D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                # if terminal, the target only equals the reward
                if minibatch[i][4]:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn2', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
def actorLearner(num, sess, lock):
    # We use the global shared O parameter vector,
    # the global shared Otarget parameter vector,
    # and the global shared counter T with the TMAX constant.
    global TMAX, T

    # open up a game state to communicate with emulator
    lock.acquire()
    game_state = game.GameState()
    lock.release()

    # initialize network gradients
    s_j_batch = []
    a_batch = []
    y_batch = []

    # get the first state by doing nothing and preprocess the image to 80x80x4
    lock.acquire()
    x_t, r_0, terminal = game_state.frame_step([1, 0, 0])
    lock.release()
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    aux_s = s_t

    time.sleep(3 * num)

    # initialize target network weights
    copyTargetNetwork(sess)

    epsilon_index = random.randrange(EPSILONS)
    INITIAL_EPSILON = INITIAL_EPSILONS[epsilon_index]
    FINAL_EPSILON = FINAL_EPSILONS[epsilon_index]
    epsilon = INITIAL_EPSILON

    print "THREAD ", num, "STARTING...", "EXPLORATION POLICY => INITIAL_EPSILON:", INITIAL_EPSILON, ", FINAL_EPSILON:", FINAL_EPSILON

    # initialize thread step counter
    t = 0
    score = 0
    while T < TMAX and score < WISHED_SCORE:
        # choose an action epsilon greedily
        readout_t = O_readout.eval(session=sess, feed_dict={s: [s_t]})
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if random.random() <= epsilon or t <= OBSERVE:
            action_index = random.randrange(ACTIONS)
            a_t[action_index] = 1
        else:
            action_index = np.argmax(readout_t)
            a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        lock.acquire()
        x_t1_col, r_t, terminal = game_state.frame_step(a_t)
        lock.release()
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_col, (80, 80)), cv2.COLOR_BGR2GRAY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        aux_s = np.delete(s_t, 0, axis=2)
        s_t1 = np.append(aux_s, x_t1, axis=2)

        # accumulate gradients
        readout_j1 = Ot_readout.eval(session=sess, feed_dict={st: [s_t1]})
        if terminal:
            y_batch.append(r_t)
        else:
            y_batch.append(r_t + GAMMA * np.max(readout_j1))
        a_batch.append(a_t)
        s_j_batch.append(s_t)

        # update the old values
        s_t = s_t1
        T += 1
        t += 1
        score += r_t

        # update the Otarget network
        if T % It == 0:
            copyTargetNetwork(sess)

        # update the O network
        if t % Iasync == 0 or terminal:
            if s_j_batch:
                # perform asynchronous update of the O network
                train_O.run(session=sess, feed_dict={y: y_batch, a: a_batch, s: s_j_batch})
            # clear gradients
            s_j_batch = []
            a_batch = []
            y_batch = []

        # save progress every 5000 iterations
        if t % 5000 == 0:
            saver.save(sess, 'save_networks_asyn/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if terminal:
            print "THREAD:", num, "/ TIME", T, "/ TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, "/ Q_MAX %e" % np.max(readout_t), "/ SCORE", score
            score = 0
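# copyTargetNetwork is called above but defined elsewhere. A minimal sketch of
# the usual implementation -- assign every Otarget variable the value of the
# corresponding O variable. The variable scope names 'O' and 'Otarget' are
# assumptions for illustration:
def copyTargetNetwork(sess):
    o_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='O')
    ot_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='Otarget')
    # pairwise copy: target <- source
    sess.run([target.assign(source) for source, target in zip(o_vars, ot_vars)])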
def train_sequentially(left_player, right_player, first_learning_player):
    # moving old logs to the old_logs folder
    if os.path.exists('logs'):
        if os.path.exists('old_logs'):
            shutil.rmtree('old_logs')
        os.mkdir('old_logs', 0755)
        copytree('logs', 'old_logs')
        shutil.rmtree('logs')
    os.mkdir('logs', 0755)

    # moving old trials to the old_trials folder
    if os.path.exists('trials_sequentially'):
        if os.path.exists('old_trials_sequentially'):
            shutil.rmtree('old_trials_sequentially')
        os.mkdir('old_trials_sequentially', 0755)
        copytree('trials_sequentially', 'old_trials_sequentially')
        shutil.rmtree('trials_sequentially')
    os.mkdir('trials_sequentially', 0755)

    current_log_folder = ''
    left_player.num_of_trains = 1
    if first_learning_player == CurrentPlayer.Left:
        current_training_player = CurrentPlayer.Left
        left_player.num_of_trains += 1
        current_log_folder = 'logs/' + 'left_player' + \
            '_learning' + str(left_player.num_of_trains)
    elif first_learning_player == CurrentPlayer.Right:
        current_training_player = CurrentPlayer.Right
        right_player.num_of_trains += 1
        current_log_folder = 'logs/' + 'right_player' + \
            '_learning' + str(right_player.num_of_trains)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D1 = deque()
    D2 = deque()

    # get the first state (do nothing) and pre-process the image to 4x80x80
    do_nothing = np.zeros(NUM_OF_ACTIONS)
    do_nothing[0] = 1
    single_game_frame, _, _, terminal, _, _ = game_state.frame_step(do_nothing, do_nothing)

    single_game_frame = skimage.color.rgb2gray(single_game_frame)
    single_game_frame = skimage.transform.resize(single_game_frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
    single_game_frame = skimage.exposure.rescale_intensity(single_game_frame, out_range=(0, 255))
    for i in range(80):  # erasing the line on the right side of the screen
        single_game_frame[79, i] = 0

    # stacking up 4 images together to form a state: 4 images = state
    current_state = np.stack((single_game_frame, single_game_frame,
                              single_game_frame, single_game_frame), axis=0)
    current_state = current_state.reshape(1, current_state.shape[0],
                                          current_state.shape[1], current_state.shape[2])

    epsilon = INITIAL_EPSILON
    observation_counter = 0
    num_folder = 0
    start_time = datetime.datetime.now()
    start_time_loss = datetime.datetime.now()
    losses = []
    episode_number = 0
    exploration_counter = 0
    exploration_flag = 0
    #j = 1
    #os.mkdir("pic/", 0755)
    #t = 0
    #pic_counter = 0

    while True:
        loss = 0
        Q_sa = 0

        # player1 - left player
        action_index1 = 0
        reward1 = 0
        action_left_player = np.zeros([NUM_OF_ACTIONS])

        # player2 - right player
        action_index2 = 0
        reward2 = 0
        action_right_player = np.zeros([NUM_OF_ACTIONS])

        # choose an action epsilon greedily
        if (observation_counter % FRAME_PER_ACTION) == 0:
            if current_training_player == CurrentPlayer.Left:
                if random.random() <= epsilon:
                    action_index1 = random.randrange(NUM_OF_ACTIONS)
                    action_left_player[action_index1] = 1
                else:
                    q = left_player.model.predict(current_state)  # input a stack of 4 images, get the prediction
                    max_Q = np.argmax(q)
                    action_index1 = max_Q
                    action_left_player[action_index1] = 1
                # right player plays greedily while the left one learns:
                q = right_player.model.predict(current_state)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index2 = max_Q
                action_right_player[action_index2] = 1
            elif current_training_player == CurrentPlayer.Right:
                if random.random() <= epsilon or exploration_flag == 1:
                    action_index2 = random.randrange(NUM_OF_ACTIONS)
                    action_right_player[action_index2] = 1
                else:
                    q = right_player.model.predict(current_state)  # input a stack of 4 images, get the prediction
                    max_Q = np.argmax(q)
                    action_index2 = max_Q
                    action_right_player[action_index2] = 1
                # left player plays greedily while the right one learns:
                q = left_player.model.predict(current_state)  # input a stack of 4 images, get the prediction
                max_Q = np.argmax(q)
                action_index1 = max_Q
                action_left_player[action_index1] = 1

        # we reduce epsilon gradually
        if (epsilon > FINAL_EPSILON) and (observation_counter > OBSERVATION):
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        single_game_frame_colored, reward1, reward2, terminal, score, _ = \
            game_state.frame_step(action_left_player, action_right_player)

        single_game_frame_gray = skimage.color.rgb2gray(single_game_frame_colored)
        thresh = threshold_otsu(single_game_frame_gray)
        single_game_frame = single_game_frame_gray > thresh
        single_game_frame = skimage.transform.resize(single_game_frame, (80, 80))
        single_game_frame = skimage.exposure.rescale_intensity(single_game_frame, out_range=(0, 255))
        for i in range(80):  # erasing the line on the right side of the screen
            single_game_frame[79, i] = 0

        #if (score >= 0):
        #    fig1 = plt.figure(pic_counter)
        #    #plt.imshow(x_t1_colored)
        #    plt.imshow(single_game_frame)
        #    print('time now: ', datetime.datetime.now())
        #    #fig1.savefig('pic/' + str(j) + '/' + str(pic_counter) + 'colored pic.png')
        #    fig1.savefig('pic/' + str(pic_counter) + 'colored pic.png')
        #    plt.close()
        #t = t + 1
        #pic_counter += 1

        single_game_frame = single_game_frame.reshape(
            1, 1, single_game_frame.shape[0], single_game_frame.shape[1])

        # next 4 images = next state
        next_state = np.append(single_game_frame, current_state[:, :3, :, :], axis=1)

        if current_training_player == CurrentPlayer.Left:
            D1.append((current_state, action_index1, reward1, next_state, terminal))
            if len(D1) > REPLAY_MEMORY:
                D1.popleft()
        elif current_training_player == CurrentPlayer.Right:
            D2.append((current_state, action_index2, reward2, next_state, terminal))
            if len(D2) > REPLAY_MEMORY:
                D2.popleft()

        # only train if done observing
        if observation_counter > OBSERVATION:
            # sample a minibatch to train on - eliminates state correlation
            minibatch = None
            if current_training_player == CurrentPlayer.Left:
                minibatch = random.sample(D1, BATCH)
            elif current_training_player == CurrentPlayer.Right:
                minibatch = random.sample(D2, BATCH)

            inputs = np.zeros((BATCH, current_state.shape[1],
                               current_state.shape[2], current_state.shape[3]))  # 32, 4, 80, 80
            targets = np.zeros((inputs.shape[0], NUM_OF_ACTIONS))  # 32, 2

            # experience replay
            for i in range(0, len(minibatch)):
                current_state_t = minibatch[i][0]
                action_t = minibatch[i][1]  # this is the action index
                reward_t = minibatch[i][2]
                next_state_t = minibatch[i][3]
                terminal_t = minibatch[i][4]
                inputs[i:i + 1] = current_state_t
                if current_training_player == CurrentPlayer.Left:
                    targets[i] = left_player.model.predict(current_state_t)
                    Q_sa = left_player.model.predict(next_state_t)
                elif current_training_player == CurrentPlayer.Right:
                    targets[i] = right_player.model.predict(current_state_t)
                    Q_sa = right_player.model.predict(next_state_t)
                if terminal_t:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa)

            if current_training_player == CurrentPlayer.Left:
                loss += left_player.model.train_on_batch(inputs, targets)
            elif current_training_player == CurrentPlayer.Right:
                loss += right_player.model.train_on_batch(inputs, targets)

            #log_file_loss_path = current_log_folder + '/loss'
            #log_file_qmax_path = current_log_folder + '/qmax'
            #num_of_lines_in_loss_file = 0
            #loss_file = None
            #
            #if not os.path.exists(current_log_folder):
            #    os.mkdir(current_log_folder, 0755)
            #
            #elapsed_time_loss = (current_time - start_time_loss).total_seconds()
            #
            #if elapsed_time_loss >= 1 * 60:
            #    episode_number += 1
            #    start_time_loss = datetime.datetime.now()
            #    loss_file = open(log_file_loss_path, 'a')
            #    num_of_lines_in_loss_file = num_of_lines_in_file(log_file_loss_path)
            #
            #    #with open(log_file_loss_path, 'a') as loss_file:
            #    loss_file.write(str(episode_number) + ' : ' + str(loss) + '\n')
            #    loss_file.flush()
            #    loss_file.close()

        current_state = next_state
        observation_counter = observation_counter + 1

        # save progress (updating weights files) every 100 iterations
        if observation_counter % 100 == 0:
            if current_training_player == CurrentPlayer.Left:
                left_player.model.save_weights('model1.h5', overwrite=True)
                with open('model1.json', 'w') as outfile:
                    json.dump(left_player.model.to_json(), outfile)
            elif current_training_player == CurrentPlayer.Right:
                right_player.model.save_weights('model2.h5', overwrite=True)
                with open('model2.json', 'w') as outfile:
                    json.dump(right_player.model.to_json(), outfile)

        current_time = datetime.datetime.now()
        elapsed_time = (current_time - start_time).total_seconds()
        if elapsed_time >= 5 * 60:
            start_time = datetime.datetime.now()
            num_folder = save_weights_file(num_folder, current_training_player,
                                           left_player, right_player)

        if terminal:  # game over
            if score[0] < score[1]:
                right_player.num_of_wins_in_a_row += 1
                left_player.num_of_wins_in_a_row = 0
            elif score[1] < score[0]:
                left_player.num_of_wins_in_a_row += 1
                right_player.num_of_wins_in_a_row = 0

            print('game ended:\n')
            print('right player wins in a row: ', right_player.num_of_wins_in_a_row, '\n')
            print('left player wins in a row: ', left_player.num_of_wins_in_a_row, '\n')

            if (current_training_player == CurrentPlayer.Left and
                    left_player.num_of_wins_in_a_row == 30):
                _ = save_weights_file(num_folder, current_training_player,
                                      left_player, right_player)
                #plot_loss(current_log_folder, current_log_folder + '/loss')
                #subprocess.call('. ~/flappy/bin/activate && ' + TEST_SEQUENTIAL_COMMAND + ' ' + str(1) +
                #                ' ' + str(left_player.num_of_trains) + ' ' + str(right_player.num_of_trains),
                #                shell=True)
                left_player.num_of_wins_in_a_row = 0
                right_player.num_of_wins_in_a_row = 0
                observation_counter = 0
                epsilon = INITIAL_EPSILON
                num_folder = 0
                # hand the training role over to the right player
                current_training_player = CurrentPlayer.Right
                right_player.num_of_trains += 1
                current_log_folder = 'logs/' + 'right_player' + \
                    '_learning' + str(right_player.num_of_trains)
                episode_number = 0
                D1.clear()
                start_time = datetime.datetime.now()
                break
            elif (current_training_player == CurrentPlayer.Right and
                    right_player.num_of_wins_in_a_row == 30):
                #plot_loss(current_log_folder, current_log_folder + '/loss')
                _ = save_weights_file(num_folder, current_training_player,
                                      left_player, right_player)
                #subprocess.call('. ~/flappy/bin/activate && ' + TEST_SEQUENTIAL_COMMAND + ' ' + str(2) +
                #                ' ' + str(left_player.num_of_trains) + ' ' + str(right_player.num_of_trains),
                #                shell=True)
                left_player.num_of_wins_in_a_row = 0
                right_player.num_of_wins_in_a_row = 0
                observation_counter = 0
                epsilon = INITIAL_EPSILON
                num_folder = 0
                # hand the training role over to the left player
                current_training_player = CurrentPlayer.Left
                left_player.num_of_trains += 1
                current_log_folder = 'logs/' + 'left_player' + \
                    '_learning' + str(left_player.num_of_trains)
                episode_number = 0
                D2.clear()
                start_time = datetime.datetime.now()
                break

            print("Episode finished!")
            print("************************")
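# save_weights_file is called above but not shown. A sketch consistent with its
# usage (it returns the incremented folder index); the directory layout mirrors
# the trials folders used by the simultaneous trainer earlier and is an
# assumption, not confirmed by the source:
def save_weights_file(num_folder, current_training_player, left_player, right_player):
    num_folder += 1
    for idx, player in ((1, left_player), (2, right_player)):
        folder = 'trials_sequentially/player%d_learning/%d' % (idx, num_folder)
        if not os.path.exists(folder):
            os.makedirs(folder, 0755)
        player.model.save_weights(folder + '/model%d.h5' % idx, overwrite=True)
    return num_folder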
def run_test(learning_player, left_player, right_player, num_of_test, test_player_log_file):
    if not os.path.exists(test_player_log_file):
        os.makedirs(test_player_log_file, 0755)
    test_start_time = datetime.datetime.now()
    no_learning_time = 0

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)  # prevents an error in frame_step; both bars stay in place
    do_nothing[0] = 1
    game_image_data, _, r_0, terminal, _, _ = game_state.frame_step(do_nothing, do_nothing)

    # image processing
    game_image_data = skimage.color.rgb2gray(game_image_data)
    game_image_data = skimage.transform.resize(game_image_data, (80, 80))
    game_image_data = skimage.exposure.rescale_intensity(game_image_data, out_range=(0, 255))
    for i in range(80):  # erasing the line on the right side of the screen
        game_image_data[79, i] = 0

    # initiating the first 4 frames to the same frame
    last_4_frames = np.stack(
        (game_image_data, game_image_data, game_image_data, game_image_data), axis=0)
    # in Keras, need to reshape
    last_4_frames = last_4_frames.reshape(1, last_4_frames.shape[0],
                                          last_4_frames.shape[1], last_4_frames.shape[2])

    num_folder = 0
    left_player_scores = []
    right_player_scores = []
    time_list = []
    game_score = []
    left_player_q_max_list = []
    right_player_q_max_list = []
    number_of_games = 10
    #number_of_games = 10  # for epsilon tests
    original_number_of_games = number_of_games
    game_start_time = datetime.datetime.now()
    left_player.num_of_wins = 0
    right_player.num_of_wins = 0

    while number_of_games > 0:
        actions_vector1 = np.zeros([ACTIONS])
        actions_vector2 = np.zeros([ACTIONS])

        # choose an action greedily for player 1:
        q1 = left_player.model.predict(last_4_frames)  # input a stack of 4 images, get the prediction
        max_Q1 = np.argmax(q1)
        action_index1 = max_Q1
        q_max1 = np.amax(q1)
        left_player_q_max_list.append((num_folder, q_max1))
        # actions_vector1: input vector for the frame_step function; contains the
        # desired action for player 1, e.g. [0, 1, 0] - up
        actions_vector1[action_index1] = 1

        # choose an action greedily for player 2:
        q2 = right_player.model.predict(last_4_frames)  # input a stack of 4 images, get the prediction
        max_Q2 = np.argmax(q2)
        action_index2 = max_Q2
        q_max2 = np.amax(q2)
        right_player_q_max_list.append((num_folder, q_max2))
        # actions_vector2: input vector for the frame_step function; contains the
        # desired action, e.g. [0, 1, 0] - up
        actions_vector2[action_index2] = 1

        with open(test_player_log_file + '/qmax', 'a') as qmax_log_file:
            if learning_player == 1:
                qmax_log_file.write(str(num_of_test) + ' : ' + str(q_max1) + '\n')
            else:
                qmax_log_file.write(str(num_of_test) + ' : ' + str(q_max2) + '\n')

        # in order for us to see the game
        image_data_colored1, _, _, terminal, score, no_learning_time = game_state.frame_step(
            actions_vector1, actions_vector2)
        game_over = terminal
        if game_over == True:
            number_of_games -= 1
            print(str(datetime.datetime.now()) + " game ended: " +
                  str(number_of_games) + " games left for the test")
            if score[0] > score[1]:
                left_player.num_of_wins += 1
            else:
                right_player.num_of_wins += 1
            with open(test_player_log_file + "/" + "game_over_log", "a") as game_over_file:
                game_over_file.write("score: " + str(score) + " game duration: " +
                                     str((datetime.datetime.now() - game_start_time).total_seconds() -
                                         no_learning_time) + " [sec]" + "\n")
                game_over_file.flush()
            left_player_scores.append(score[0])
            right_player_scores.append(score[1])
            game_score.append(score)
            current_time = datetime.datetime.now()
            elapsedTime = (current_time - game_start_time).total_seconds() - no_learning_time
            time_list.append(elapsedTime)
            game_start_time = datetime.datetime.now()

        # image processing
        x_t1 = skimage.color.rgb2gray(image_data_colored1)
        x_t1 = skimage.transform.resize(x_t1, (80, 80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1, out_range=(0, 255))
        for i in range(80):  # erasing the line on the right side of the screen
            x_t1[79, i] = 0
        x_t1 = x_t1.reshape(1, 1, x_t1.shape[0], x_t1.shape[1])
        last_4_frames1 = np.append(x_t1, last_4_frames[:, :3, :, :], axis=1)
        last_4_frames = last_4_frames1

        if number_of_games == 0:
            average_time = np.mean(time_list)
            left_player_average_score = np.mean(left_player_scores)
            right_player_average_score = np.mean(right_player_scores)
            print("left_player_num_of_wins: ", left_player.num_of_wins)
            print("\nright_player_num_of_wins: ", right_player.num_of_wins)
            left_player_win_percentage = (left_player.num_of_wins /
                                          float(original_number_of_games)) * 100
            right_player_win_percentage = (right_player.num_of_wins /
                                           float(original_number_of_games)) * 100
            print("\n\nleft_player_win_percentage: ", left_player_win_percentage)
            print("\nright_player_win_percentage: ", right_player_win_percentage)
            with open(test_player_log_file + "/" + "game_summary", "a") as results_file:
                results_file.write("\n" + "Game Summary" + str(num_of_test) + ":" + "\n" +
                                   " left player average score: " + str(left_player_average_score) + " [points]" + "\n" +
                                   " left player win percentage: " + str(left_player_win_percentage) + "%" + "\n" +
                                   " right player average score: " + str(right_player_average_score) + " [points]" + "\n" +
                                   " right player win percentage: " + str(right_player_win_percentage) + "%" + "\n" +
                                   " average time: " + str(average_time) + "[sec]" + "\n" + "\n" + "\n")

    return time_list, game_score, left_player_q_max_list, right_player_q_max_list