def playPlane():
    # Step 1: init BrainDQN
    actions = 3
    brain = BrainDQN(actions)
    # Step 2: init Plane Game
    plane = game.GameState()
    # Step 3: play game
    # Step 3.1: obtain init state
    action0 = np.array([1, 0, 0])  # [1,0,0]: do nothing, [0,1,0]: left, [0,0,1]: right
    observation0, reward0, terminal = plane.frame_step(action0)
    observation0 = cv2.cvtColor(cv2.resize(observation0, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, observation0 = cv2.threshold(observation0, 1, 255, cv2.THRESH_BINARY)
    brain.setInitState(observation0)
    # Step 3.2: run the game
    while True:
        action = brain.getAction()
        nextObservation, reward, terminal = plane.frame_step(action)
        nextObservation = preprocess(nextObservation)
        brain.setPerception(nextObservation, action, reward, terminal)
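
# playPlane() above calls a preprocess() helper that is not shown in this
# excerpt. A minimal sketch of what it could look like, assuming it mirrors the
# inline cv2 pipeline used for the first frame (resize to 80x80, grayscale,
# binary threshold, reshape to 80x80x1 as in trainNetwork() below). The
# two-argument preprocess(observation, flag) used by the main() functions
# further down presumably does the same image processing plus a flag whose
# meaning is not covered here.
import cv2
import numpy as np

def preprocess(observation):
    # shrink the raw BGR frame to 80x80 and drop color information
    observation = cv2.cvtColor(cv2.resize(observation, (80, 80)), cv2.COLOR_BGR2GRAY)
    # binarize: any pixel brighter than 1 becomes 255, the rest 0
    _, observation = cv2.threshold(observation, 1, 255, cv2.THRESH_BINARY)
    # add a channel axis so the frame can be stacked with previous ones
    return np.reshape(observation, (80, 80, 1))
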
def main():
    begin_time = datetime.datetime.now()
    env = game.GameState()
    # env.display = ~TRAINING
    brain = rl_brain_pytorch.DeepQNetwork()
    step = 0
    for episode in range(rl_brain_pytorch.MAX_EPISODE):
        # do nothing
        observation, _, _ = env.frame_step([1, 0, 0])
        observation = preprocess(observation, False)
        brain.reset(observation)
        score = 0.0
        while True:
            action = brain.choose_action(observation)
            observation_, reward, done = env.frame_step(action)
            if reward == 1:
                score += 1
            observation_ = preprocess(observation_, True)
            if TRAINING:
                brain.store_transition(observation, action, reward, done, observation_)
            # start learning once some experience has accumulated in memory
            if step > 200:
                if TRAINING:
                    brain.learn()
            if done:
                break
            observation = observation_
            step += 1
        end_time = datetime.datetime.now()
        print("episode {} over. exec time:{} step:{} score:{}".format(
            episode, end_time - begin_time, step, score))
    brain.saveNet()
    env.exit("game over")
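
# choose_action() on the DeepQNetwork brain is defined in rl_brain_pytorch and
# not shown here. Below is a minimal, standalone epsilon-greedy sketch of the
# selection rule it presumably implements, mirroring the epsilon-greedy branch
# of trainNetwork() further down. The helper name select_action and its
# argument names are hypothetical, and whether the brain returns an action
# index or a one-hot vector depends on what game.GameState.frame_step expects.
import numpy as np
import torch

def select_action(eval_net, observation, epsilon, n_actions):
    """Pick a random action with probability epsilon, else the greedy one."""
    if np.random.uniform() < epsilon:
        return np.random.randint(0, n_actions)
    with torch.no_grad():
        state = torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)
        return int(eval_net(state).argmax(dim=1).item())
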
def main():
    begin_time = datetime.datetime.now()
    env = game.GameState()
    brain = DeepQNetwork(n_actions=N_ACTIONS,
                         memory_size=MEMORY_SIZE,
                         minibatch_size=MINIBATCH_SIZE,
                         gamma=GAMMA,
                         epsilon=INITIAL_EPSILON)
    step = 0
    for episode in range(MAX_EPISODE):
        # do nothing
        observation, _, _ = env.frame_step([1, 0, 0])
        observation = preprocess(observation, False)
        brain.reset(observation)
        while True:
            action = brain.choose_action(observation)
            observation_, reward, done = env.frame_step(action)
            observation_ = preprocess(observation_, True)
            brain.store_transition(observation, action, reward, done, observation_)
            # start learning once some experience has accumulated in memory
            if step > 200:
                brain.learn()
            if done:
                break
            observation = observation_
            step += 1
        end_time = datetime.datetime.now()
        print("episode {} over. exec time:{} step:{}".format(
            episode, end_time - begin_time, step))
    env.exit("game over")
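
# store_transition() and learn() are methods of the DeepQNetwork class and are
# not shown in this excerpt. For reference, a minimal replay-memory sketch of
# the mechanism they rely on, following the deque-based pattern that
# trainNetwork() below uses directly; the class name ReplayMemory and its
# method names are illustrative, not the author's actual API.
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity):
        # oldest transitions are dropped automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def store(self, s, a, r, done, s_next):
        # keep the full transition so the learner can rebuild the Q-learning target
        self.buffer.append((s, a, r, done, s_next))

    def sample(self, batch_size):
        # uniform random minibatch, as in the random.sample(D, BATCH) call below
        return random.sample(self.buffer, batch_size)
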
def trainNetwork(s, readout, W_fc1, W_fc2, sess):
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS], name='action')
    y = tf.placeholder("float", [None], name='q_next')
    tf.summary.histogram('q_next', y)
    with tf.name_scope('q_eval'):
        readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1)
        tf.summary.histogram('fc2/output', readout_action)
    with tf.name_scope('loss'):
        cost = tf.reduce_mean(tf.square(y - readout_action))
        tf.summary.scalar('loss', cost)
    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(cost)

    # network weight difference (L2 norm of the change since the last snapshot)
    regularize_lambda = 1.0
    regularizer = tf.contrib.layers.l2_regularizer(regularize_lambda)  # equal to tf.nn.l2_loss
    with tf.name_scope('weight'):
        last_W_fc1 = tf.Variable(tf.constant(0.0, shape=W_fc1.get_shape()))
        diff_W_fc1 = tf.contrib.layers.apply_regularization(regularizer, [W_fc1 - last_W_fc1])
        tf.summary.scalar('diff_W_fc1', diff_W_fc1)
        last_W_fc1_update = tf.assign(last_W_fc1, W_fc1)
        last_W_fc2 = tf.Variable(tf.constant(0.0, shape=W_fc2.get_shape()))
        diff_W_fc2 = tf.contrib.layers.apply_regularization(regularizer, [W_fc2 - last_W_fc2])
        tf.summary.scalar('diff_W_fc2', diff_W_fc2)
        last_W_fc2_update = tf.assign(last_W_fc2, W_fc2)

    # tensorboard output
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(r"result/Exp14Graph/", sess.graph)

    # record the reward per life
    with tf.name_scope('reward_per_life'):
        reward = tf.Variable(0.0, name='reward')
        reward_sum = tf.summary.scalar('reward_per_life', reward)
    # record the reward every 10000 time steps
    with tf.name_scope('reward_per_10000_steps'):
        reward_step = tf.Variable(0.0, name='reward_step')
        reward_sum_step = tf.summary.scalar('reward_per_10000_steps', reward_step)

    # placeholder to record reward
    reward_count = tf.placeholder('float')
    zero = tf.Variable(0.0, name='zero')
    re_count = 0.0
    life_count = 1
    reward_update = tf.assign(reward, reward + reward_count)
    reward_fresh = tf.assign(reward, zero)
    # placeholder to record reward_step
    reward_count_step = tf.placeholder('float')
    re_count_step = 0.0
    reward_update_step = tf.assign(reward_step, reward_step + reward_count_step)
    reward_fresh_step = tf.assign(reward_step, zero)

    # record average max q value
    with tf.name_scope('50kper_average_qMax'):
        q_max = tf.Variable(0.0, name='q_max_average')
        q_max_sum = tf.summary.scalar('50kper_average_qMax', q_max)
    # placeholder to record q value
    q_max_count = tf.placeholder('float')
    q_count = 0.0
    batch_count = 0
    q_max_update = tf.assign(q_max, q_max_count)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t_colored, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # variable to save the current game frame
    game_frame = x_t_colored

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state(r"result/Exp14_saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    omega = INITIAL_OMEGA
    t = 0
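    # Two exploration knobs drive the loop below: epsilon is the usual
    # epsilon-greedy probability of taking a random action, while omega is the
    # probability of deferring to the hand-written rule in RuleAction3 when it
    # reports that the current frame is "safe". Epsilon is annealed linearly
    # from INITIAL_EPSILON to FINAL_EPSILON over EXPLORE steps; omega decays
    # exponentially as INITIAL_OMEGA * DECAY_RATE ** (t / DECAY_STEPS).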
    episode_reward = 0
    while True:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s: [s_t]})[0]  # network output (Q values) for the current input images
        a_t = np.zeros([ACTIONS])
        action_index = 0
        ruleAction, safeAction = RuleAction3.rule_action(game_frame)
        if t % FRAME_PER_ACTION == 0:
            if safeAction == "safe":
                # choose the action suggested by the rule with probability omega
                if random.random() <= omega:
                    if ruleAction == "left":
                        action_index = 1
                        a_t[action_index] = 1
                    elif ruleAction == "right":
                        action_index = 2
                        a_t[action_index] = 1
                    print("----------Rule Action----------")
                elif random.random() <= epsilon:
                    print("----------Random Action----------")
                    action_index = random.randrange(ACTIONS)
                    a_t[action_index] = 1
                else:
                    action_index = np.argmax(readout_t)
                    a_t[action_index] = 1
            else:
                # the rule reports danger: take the action it considers safe
                print("----------Safe Action----------")
                if safeAction == "left":
                    action_index = 1
                    a_t[action_index] = 1
                elif safeAction == "right":
                    action_index = 2
                    a_t[action_index] = 1
        else:
            a_t[0] = 1  # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # scale down omega
        # if omega - epsilon > 0 and t > OBSERVE:
        #     omega -= (INITIAL_OMEGA - FINAL_OMEGA) / EXPLORE
        if t > OBSERVE:
            omega = INITIAL_OMEGA * (DECAY_RATE ** (t / DECAY_STEPS))

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY:
            D.popleft()

        # update game frame
        game_frame = x_t1_colored
        episode_reward += r_t

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on (experience replay)
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict={s: s_j1_batch})  # Q values of the next states

            # average max q value
            batch_count += 1
            q_count += np.max(readout_t)
            BATCH_N = 50000
            if batch_count % BATCH_N == 0:
                sess.run(q_max_update, feed_dict={q_max_count: float(q_count / BATCH_N)})
                qm = sess.run(q_max_sum)
                writer.add_summary(qm, batch_count)
                q_count = 0.0

            # total reward
            re_count += r_t
            re_count_step += r_t
            if terminal:
                sess.run(reward_update, feed_dict={reward_count: float(re_count)})
                re = sess.run(reward_sum)
                writer.add_summary(re, life_count)
                sess.run(reward_fresh)
                re_count = 0
                life_count += 1
            if t % 10000 == 0:
                sess.run(reward_update_step, feed_dict={reward_count_step: float(re_count_step)})
                re_step = sess.run(reward_sum_step)
                writer.add_summary(re_step, t * 10000)
                sess.run(reward_fresh_step)
                re_count_step = 0

            for i in range(0, len(minibatch)):
                ith_terminal = minibatch[i][4]
                # if terminal, the target is just the reward
                if ith_terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step: feed the targets back to update the network
            train_step.run(feed_dict={
                y: y_batch,
                a: a_batch,
                s: s_j_batch
            })

            # update tensorboard data every 1000 steps
            if t % 1000 == 0:
                result = sess.run(merged, feed_dict={
                    y: y_batch,
                    a: a_batch,
                    s: s_j_batch
                })
                writer.add_summary(result, t)

            # record network weights every 1000 steps
            if t % 1000 == 0:
                sess.run([last_W_fc1_update, last_W_fc2_update])

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'result/Exp14_saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state,
              "/ EPSILON", epsilon, "/ OMEGA", omega,
              "/ ACTION", action_index, "/ REWARD", r_t,
              "/ EPISODE_REWARD", episode_reward,
              "/ Q_MAX %e" % np.max(readout_t),
              "/ TERMINAL", terminal)

        if terminal:
            episode_reward = 0
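
# For reference, the Q-learning target assembled in the minibatch loop above
# can be written as one vectorized step. This is only an equivalent sketch of
# the same computation, not a drop-in replacement for the TF 1.x session code;
# r_batch and terminal_batch are assumed to be 1-D arrays and readout_j1_batch
# the (batch, ACTIONS) array of next-state Q values.
import numpy as np

def q_learning_targets(r_batch, terminal_batch, readout_j1_batch, gamma):
    """y_i = r_i for terminal transitions, else r_i + gamma * max_a Q(s'_i, a)."""
    r = np.asarray(r_batch, dtype=np.float32)
    done = np.asarray(terminal_batch, dtype=bool)
    next_q_max = np.max(np.asarray(readout_j1_batch), axis=1)
    return np.where(done, r, r + gamma * next_q_max)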