def running():
    game_state = game.Main()
    t = 0
    episode = 0
    total_reward = 0
    reward_array = []
    max_q_array = []
    time_line_q = []
    time_line_r = []

    # get the first state by doing nothing
    do_nothing = np.zeros(ACTIONS)
    do_nothing[1] = 1
    x_t, r_0, terminal, ball_x, bat_mid = game_state.frame_step(do_nothing)

    while t <= RUNNING:
        a_t = np.zeros([ACTIONS])
        action_index = 1
        if t % FRAME_PER_ACTION == 0:
            # choose the heuristic "human" action with probability PROBABILITY
            if random.random() <= PROBABILITY:
                print("----------Human Action----------")
                if ball_x < bat_mid:
                    a_t = [1, 0, 0]  # move to the left
                elif ball_x > bat_mid:
                    a_t = [0, 0, 1]  # move to the right
                else:
                    a_t = [0, 1, 0]  # do nothing
            else:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
        else:
            a_t[1] = 1  # do nothing
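
# Both running() above and the omega-guided trainNetwork() further below use the
# same ball-tracking rule for the "human" action. A minimal sketch of that rule as
# a reusable helper; the name human_action and its docstring are illustrative
# additions, not part of the original code.
def human_action(ball_x, bat_mid):
    """Return a one-hot action ([left, stay, right]) that moves the bat toward the ball."""
    if ball_x < bat_mid:
        return [1, 0, 0]  # move to the left
    elif ball_x > bat_mid:
        return [0, 0, 1]  # move to the right
    return [0, 1, 0]      # do nothing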

def trainNetwork(s, readout, h_fc1, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with the emulator
    game_state = game.Main()

    # store the previous observations in replay memory
    D = deque()

    # printing
    # a_file = open("logs_" + GAME + "/readout.txt", 'w')
    # h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[1] = 1
    x_t, r_0, terminal, ball_x, bat_mid = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("DQN_saved_networks")
    # if checkpoint and checkpoint.model_checkpoint_path:
    #     saver.restore(sess, checkpoint.model_checkpoint_path)
    #     print("Successfully loaded:", checkpoint.model_checkpoint_path)
    # else:
    #     print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    episode = 0
    total_reward = 0
    reward_array = []
    max_q_array = []
    time_line_q = []
    time_line_r = []
    while t <= OBSERVE + EXPLORE + TRAINING:
        # choose an action epsilon-greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 1
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[1] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe the next state and reward
        x_t1_colored, r_t, terminal, ball_x, bat_mid = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        # s_t1 = np.append(x_t1, s_t[:, :, 1:], axis=2)
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # only train once done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            # accumulate reward per episode for plotting
            if not terminal:
                total_reward += r_t
            else:
                episode += 1
                time_line_r.append(episode)
                reward_array.append(total_reward)
                total_reward = 0

            # record the max Q value every 1000 frames for plotting
            if t % 1000 == 0:
                max_q_value = np.max(readout_t)
                max_q_array.append(max_q_value)
                time_line_q.append(t // 1000)

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})
            for i in range(0, len(minibatch)):
                ith_terminal = minibatch[i][4]
                # if terminal, the target only equals the reward
                if ith_terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict={y: y_batch, a: a_batch, s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'DQN_saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state,
              "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Q_MAX %e" % np.max(readout_t))
        # write info to files
        '''
        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s: [s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)
        '''

    # save the recorded lists to text files
    time_line_r_file = open('.\\results\\lists\\5.5start\\time_line_r.txt', 'w')
    for word in time_line_r:
        time_line_r_file.write(str(word))
        time_line_r_file.write('\n')
    time_line_r_file.close()

    time_line_q_file = open('.\\results\\lists\\5.5start\\time_line_q.txt', 'w')
    for word in time_line_q:
        time_line_q_file.write(str(word))
        time_line_q_file.write('\n')
    time_line_q_file.close()

    reward_array_file = open('.\\results\\lists\\5.5start\\reward_array.txt', 'w')
    for word in reward_array:
        reward_array_file.write(str(word))
        reward_array_file.write('\n')
    reward_array_file.close()

    max_q_array_file = open('.\\results\\lists\\5.5start\\max_q_array.txt', 'w')
    for word in max_q_array:
        max_q_array_file.write(str(word))
        max_q_array_file.write('\n')
    max_q_array_file.close()

    # plot results
    plt.figure()
    plt.xlabel("step")
    plt.ylabel("max Q value")
    plt.title("DQN max Q value change")
    plt.plot(time_line_q, max_q_array)
    plt.savefig('./DQN_max_Q_value.png')

    plt.figure()
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.title("DQN reward per episode change")
    plt.plot(time_line_r, reward_array)
    plt.savefig('./DQN_reward.png')
    plt.show()
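
# The y_batch loop above implements the standard Q-learning target:
#     y_i = r_i                                  if the transition is terminal
#     y_i = r_i + GAMMA * max_a' Q(s'_i, a')     otherwise
# A minimal standalone sketch of the same computation; the helper name
# bellman_target, the gamma default, and the example numbers are illustrative
# only (the training code uses the module-level GAMMA).
def bellman_target(r, max_q_next, terminal, gamma=0.99):
    """Return r for terminal transitions, otherwise r + gamma * max_a' Q(s', a')."""
    return r if terminal else r + gamma * max_q_next

# e.g. bellman_target(1.0, 2.5, False) == 1.0 + 0.99 * 2.5 == 3.475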

def trainNetwork(s, readout, W_fc1, W_fc2, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS], name='action')
    y = tf.placeholder("float", [None], name='q_next')
    tf.summary.histogram('q_next', y)
    with tf.name_scope('q_eval'):
        readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
        tf.summary.histogram('fc2/output', readout_action)
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(tf.square(y - readout_action))
        tf.summary.scalar('loss', cost)
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # network weight difference since the last snapshot (L2 norm)
    regularize_lambda = 1.0
    regularizer = tf.contrib.layers.l2_regularizer(regularize_lambda)  # equivalent to tf.nn.l2_loss
    with tf.name_scope('weight'):
        last_W_fc1 = tf.Variable(tf.constant(0.0, shape=W_fc1.get_shape()))
        diff_W_fc1 = tf.contrib.layers.apply_regularization(regularizer, [W_fc1 - last_W_fc1])
        tf.summary.scalar('diff_W_fc1', diff_W_fc1)
        last_W_fc1_update = tf.assign(last_W_fc1, W_fc1)

        last_W_fc2 = tf.Variable(tf.constant(0.0, shape=W_fc2.get_shape()))
        diff_W_fc2 = tf.contrib.layers.apply_regularization(regularizer, [W_fc2 - last_W_fc2])
        tf.summary.scalar('diff_W_fc2', diff_W_fc2)
        last_W_fc2_update = tf.assign(last_W_fc2, W_fc2)

    # tensorboard output
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(r"result/Exp10Graph/", sess.graph)

    # record the reward per life
    with tf.name_scope('reward_per_life'):
        reward = tf.Variable(0.0, name='reward')
        reward_sum = tf.summary.scalar('reward_per_life', reward)

    # record the reward every 10000 time steps
    with tf.name_scope('reward_per_10000_steps'):
        reward_step = tf.Variable(0.0, name='reward_step')
        reward_sum_step = tf.summary.scalar('reward_per_10000_steps', reward_step)

    # placeholder to record the reward
    reward_count = tf.placeholder('float')
    zero = tf.Variable(0.0, name='zero')
    re_count = 0.0
    life_count = 1
    reward_update = tf.assign(reward, reward + reward_count)
    reward_fresh = tf.assign(reward, zero)

    # placeholder to record reward_step
    reward_count_step = tf.placeholder('float')
    re_count_step = 0.0
    reward_update_step = tf.assign(reward_step, reward_step + reward_count_step)
    reward_fresh_step = tf.assign(reward_step, zero)

    # record the average max Q value
    with tf.name_scope('50kper_average_qMax'):
        q_max = tf.Variable(0.0, name='q_max_average')
        q_max_sum = tf.summary.scalar('50kper_average_qMax', q_max)

    # placeholder to record the Q value
    q_max_count = tf.placeholder('float')
    q_count = 0.0
    batch_count = 0
    q_max_update = tf.assign(q_max, q_max_count)

    # open up a game state to communicate with the emulator
    game_state = game.Main()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[1] = 1
    x_t_colored, r_0, terminal, ball_x, bat_mid = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # variable to save the pygame frame
    # game_frame = x_t_colored

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state(r"result/Exp10_saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    omega = INITIAL_OMEGA
    t = 0
    episode_reward = 0
    while True:
        # choose an action epsilon-greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]  # Q values of the current state (frame stack)
        a_t = np.zeros([ACTIONS])
        action_index = 1
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= omega and t > OBSERVE:
                print("----------Human Action----------")
                if ball_x < bat_mid:
                    a_t = [1, 0, 0]  # move to the left
                elif ball_x > bat_mid:
                    a_t = [0, 0, 1]  # move to the right
                else:
                    a_t = [0, 1, 0]  # do nothing
            elif random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[action_index] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[1] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # scale down omega
        # if omega - epsilon > 0 and t > OBSERVE:
        #     omega -= (INITIAL_OMEGA - FINAL_OMEGA) / EXPLORE
        if t > OBSERVE:
            omega = INITIAL_OMEGA * (DECAY_RATE ** (t / DECAY_STEPS))

        # run the selected action and observe the next state and reward
        x_t1_colored, r_t, terminal, ball_x1, bat_mid1 = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # update the pygame frame
        # game_frame = x_t1_colored

        episode_reward += r_t

        # only train once done observing
        if t > OBSERVE:
            # sample a minibatch to train on (experience replay)
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})  # Q values of the next states

            # average max Q value over the last BATCH_N training steps
            batch_count += 1
            q_count += np.max(readout_t)
            BATCH_N = 50000
            if batch_count % BATCH_N == 0:
                sess.run(q_max_update, feed_dict={q_max_count: float(q_count / BATCH_N)})
                qm = sess.run(q_max_sum)
                writer.add_summary(qm, batch_count)
                q_count = 0.0

            # total reward
            re_count += r_t
            re_count_step += r_t
            if terminal:
                sess.run(reward_update, feed_dict={reward_count: float(re_count)})
                re = sess.run(reward_sum)
                writer.add_summary(re, life_count)
                sess.run(reward_fresh)
                re_count = 0
                life_count += 1
            if t % 10000 == 0:
                sess.run(reward_update_step, feed_dict={reward_count_step: float(re_count_step)})
                re_step = sess.run(reward_sum_step)
                writer.add_summary(re_step, t // 10000)  # one summary point per 10000 steps
                sess.run(reward_fresh_step)
                re_count_step = 0

            for i in range(0, len(minibatch)):
                ith_terminal = minibatch[i][4]
                # if terminal, the target only equals the reward
                if ith_terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step (feed the targets back to update the network)
            train_step.run(feed_dict={
                y: y_batch,
                a: a_batch,
                s: s_j_batch
            })

            # update tensorboard data every 1000 steps
            if t % 1000 == 0:
                result = sess.run(merged, feed_dict={
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch
                })
                writer.add_summary(result, t)

            # record the network weights
            if t % 1000 == 0:
                sess.run([last_W_fc1_update, last_W_fc2_update])

        # update the old values
        s_t = s_t1
        t += 1
        ball_x = ball_x1
        bat_mid = bat_mid1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'result/Exp10_saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif OBSERVE < t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, "/ EPSILON", epsilon,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ EPISODE_REWARD", episode_reward,
              "/ Q_MAX %e" % np.max(readout_t), "/ TERMINAL", terminal)

        if terminal:
            episode_reward = 0
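
# The omega schedule above decays the probability of taking the heuristic "human"
# action exponentially once the observation phase ends. A minimal sketch of the
# schedule; OBSERVE, INITIAL_OMEGA, DECAY_RATE and DECAY_STEPS are module-level
# constants in the original code, and the default values below are assumptions
# used only for the example.
def omega_at(t, observe=10000, initial_omega=0.9, decay_rate=0.96, decay_steps=10000):
    """Probability of choosing the human (ball-tracking) action at time step t."""
    if t <= observe:
        return initial_omega
    return initial_omega * decay_rate ** (t / decay_steps)

# e.g. omega_at(50000) == 0.9 * 0.96 ** 5 ≈ 0.734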