def trainDeepModel(load=False):
    # Used to see how long the model takes to train - the model still needs to be optimized!
    start_time = time.time()

    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False

    # True - load the model from MODEL_PATH_LOAD; False - initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # (it has to have a grid_size of 10 for this current NN)
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01   # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99   # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1  # Probability to choose a random action instead of the best action

    # Linear epsilon decay schedule
    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    # Linear alpha decay schedule
    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Create the NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=load)

    # Error / loss function
    # reduce_max -> reduces the [1, 4] tensor to a scalar of the max value
    with tf.name_scope('Error'):
        # test
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1, 4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # error plot
    # errors = []

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # The session can start running
    with tf.Session() as sess:

        # Restore the model, to keep training it
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        # Initialize global variables
        sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(time.time() - start_time))
        start_time = time.time()

        # Main training loop
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            # Linear decay function for alpha
            if alpha_function:
                alpha = (-alpha_start / (alpha_percentage * total_episodes)) * episode + (alpha_start + alpha_end)
                if alpha < alpha_end:
                    alpha = alpha_end

            # Linear decay function for epsilon
            if epsilon_function:
                epsilon = (-(epsilon_start - epsilon_end) / (epsilon_percentage * total_episodes)) * episode + epsilon_start
                if epsilon < epsilon_end:
                    epsilon = epsilon_end

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector_3D()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector", Q_vector)  # DEBUGGING

                # Deciding on which action to take (epsilon-greedy)
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value of the Q values (output vector of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                # If this is the final state of the episode, the target is just the reward
                if done:
                    Q_vector[:, action] = reward
                    # print("Reward:", reward)
                else:
                    # Gather the next state's action-value vector
                    new_state_vector = env.state_vector_3D()
                    y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                    # Max Q value of the next state
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # Q-learning target: reward + gamma * max_a' Q(s', a')
                    Q_vector[:, action] = reward + (gamma * maxq)

                # One gradient descent step towards the updated Q_vector target
                _, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
                # _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
                # e = sess.run(error, feed_dict={x: state_vector, y: Q_vector})
                # sess.run(optimizer)

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                # Add to the error list, to show the plot at the end of training - RAM OVERLOAD!!!
                # errors.append(e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = time.time() - start_time
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(epsilon),
                      # "\ttime {0:.0f}:{1:.0f}".format(current_time/60, current_time%60))
                      end="")

                # Print the elapsed time as h:mm:ss, zero-padding minutes and seconds by hand
                if current_time % 60 < 10:
                    if math.floor((current_time / 60) % 60) < 10:
                        print("\ttime {0:.0f}:0{1:.0f}:0{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                    else:
                        print("\ttime {0:.0f}:{1:.0f}:0{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                else:
                    if math.floor((current_time / 60) % 60) < 10:
                        print("\ttime {0:.0f}:0{1:.0f}:{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))
                    else:
                        print("\ttime {0:.0f}:{1:.0f}:{2:.0f}".format(
                            math.floor((current_time / 60) / 60),
                            math.floor((current_time / 60) % 60),
                            current_time % 60))

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npy files (can't save a 4D array to a text file)
                W_conv1 = np.array(sess.run(weights['W_conv1']))
                W_conv2 = np.array(sess.run(weights['W_conv2']))
                W_fc = np.array(sess.run(weights['W_fc']))
                W_out = np.array(sess.run(weights['W_out']))

                b_conv1 = np.array(sess.run(biases['b_conv1']))
                b_conv2 = np.array(sess.run(biases['b_conv2']))
                b_fc = np.array(sess.run(biases['b_fc']))
                b_out = np.array(sess.run(biases['b_out']))

                np.save(W_conv1_textfile_path_save, W_conv1.astype(np.float32))
                np.save(W_conv2_textfile_path_save, W_conv2.astype(np.float32))
                np.save(W_fc_textfile_path_save, W_fc.astype(np.float32))
                np.save(W_out_textfile_path_save, W_out.astype(np.float32))

                np.save(b_conv1_textfile_path_save, b_conv1.astype(np.float32))
                np.save(b_conv2_textfile_path_save, b_conv2.astype(np.float32))
                np.save(b_fc_textfile_path_save, b_fc.astype(np.float32))
                np.save(b_out_textfile_path_save, b_out.astype(np.float32))

                # Write the error summary for Tensorboard
                s = sess.run(merged_summary, feed_dict={x: state_vector, y: Q_vector})
                writer.add_summary(s, episode)

        # Final checkpoint of the full model
        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
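
# A minimal sketch (not part of the original code) of a helper that could replace the
# nested zero-padding branches in the progress printout above; "format_elapsed" is a
# hypothetical name, assuming the elapsed time is given in seconds.
def format_elapsed(seconds):
    # divmod splits the total seconds into hours, minutes and seconds,
    # and the {:02d} fields handle the zero padding in a single format call.
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return "{0:d}:{1:02d}:{2:02d}".format(hours, minutes, secs)
    # Example: format_elapsed(3723.4) returns "1:02:03"
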
def runDeepModel():
    # Testing the trained network
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - load the model from MODEL_PATH_LOAD; False - initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # (it has to have a grid_size of 10 for this current NN)
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01     # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99     # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.001  # Probability to choose a random action instead of the best action

    # Create the NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=True)

    # Error / loss function
    # Not sure why it's reduce_mean, it reduces the [1, 4] tensor to a scalar of the mean value
    with tf.name_scope('Error'):
        # e1 = tf.subtract(y, Q_values)
        # e2 = tf.square(e1)
        # error = tf.reduce_mean(e2, axis=1)

        # test
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.sqrt(tf.square(tf.subtract(Q_values, y))), axis=1)
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1, 4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 1
    total_episodes = 10

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # The session can start running
    with tf.Session() as sess:

        # Restore the model, to keep training it
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        sess.run(init)

        # Run the loaded DQN model for a few evaluation episodes
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector_3D()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector", Q_vector)  # DEBUGGING

                # Deciding on which action to take (epsilon-greedy)
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value of the Q values (output vector of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                print("Ep:", episode,
                      "\tavg t:", avg_time / print_episode,
                      "\tavg score:", avg_score / print_episode)
                avg_time = 0
                avg_score = 0
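
# A small sketch of the epsilon-greedy rule used in both trainDeepModel() and
# runDeepModel() above. "epsilon_greedy_action" is a hypothetical helper, not part
# of the original code; it assumes the same global "y" placeholder and "action_t"
# argmax node that the two functions build.
def epsilon_greedy_action(sess, env, Q_vector, action_t, epsilon):
    # Explore: with probability epsilon, take a random action from the environment.
    if np.random.rand() <= epsilon:
        return env.sample_action()
    # Exploit: otherwise take the index of the largest Q value in the [1, 4] output.
    return sess.run(action_t, feed_dict={y: Q_vector})
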
                      tail=TAIL,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      multiplier_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    brain = Agent(gamma=0.99,
                  epsilon=start_eps,
                  alpha=0.001,
                  maxMemorySize=5000,
                  replace=None)

    # env.play()

    if RENDER:
        env.prerender()

    # Fill the replay memory with transitions from random play before learning starts
    while brain.memCntr < brain.memSize:
        obs, _ = env.reset()
        observation = env.state_vector_3D()
        done = False
        while not done:
            action = env.sample_action()
            observation_, reward, done, info = env.step(action)
            observation_ = env.state_vector_3D()

            # Punish the terminal transition so the agent learns to avoid dying
            if done:
                reward = -1

            brain.storeTransition(observation, action, reward, observation_)
            observation = observation_

    print("Done initialising memory")

    scores = []
    epsHistory = []
    numGames = 100000
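
# The Agent class used above is defined elsewhere in the project. Purely as an
# illustration (a hypothetical sketch, not the actual Agent implementation), its
# replay memory is assumed to behave like a simple ring buffer of
# (state, action, reward, next_state) tuples, roughly like this:
class ReplayMemorySketch:
    def __init__(self, maxMemorySize):
        self.memSize = maxMemorySize
        self.memory = []
        self.memCntr = 0  # total number of transitions stored so far

    def storeTransition(self, state, action, reward, state_):
        transition = (state, action, reward, state_)
        if len(self.memory) < self.memSize:
            self.memory.append(transition)
        else:
            # Once the buffer is full, overwrite the oldest transition
            self.memory[self.memCntr % self.memSize] = transition
        self.memCntr += 1
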