def run2():  # Testing
    print("Running the Linear Function Q-Learning Model from tf.Saver()")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - Load model from MODEL_PATH_LOAD; False - Initialise random weights
    USE_SAVED_MODEL_FILE = True

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap = WRAP, grid_size = GRID_SIZE, rate = 100, max_time = 20, tail = TAIL, action_space = 4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01    # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99    # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1   # Probability to choose a random action instead of the best action

    # Create NN model
    with tf.name_scope('Model'):
        Q_values = createModel(x)

    # Error / Loss function
    # reduce_max reduces the [1,4] squared-difference tensor to its single largest element
    with tf.name_scope('Error'):
        # e1 = tf.subtract(y, Q_values)
        # e2 = tf.square(e1)
        # error = tf.reduce_mean(e2, axis=1)
        error = tf.reduce_max(tf.square(Q_values - y), axis=1)
        # error = tf.square(tf.subtract(y, Q_values))

    # Gradient descent optimizer - minimizes error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1,4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (Made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 100
    total_episodes = 10000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    model = tf.global_variables_initializer()

    # Tensorboard capabilities
    # writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model to keep using its weights; otherwise initialise fresh weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            # Different model restore method
            # new_saver = tf.train.import_meta_graph('my-model.meta')
            # new_saver.restore(sess, tf.train.latest_checkpoint('./'))
            print("Model restored.")
        else:
            sess.run(model)

        # Testing my DQN model
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector", Q_vector) # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the chosen action
                new_state, reward, done, info = env.step(action)
                state = new_state

                if reward == 100:
                    print("reached food")

                # Gathering the now-current state's action-value vector
                # new_state_vector = env.state_vector()
                # y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                # Equation for training
                # maxq = sess.run(y_prime_max, feed_dict={y: y_prime})
                # Q_vector[:, action] = reward + (gamma * maxq)

                _, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
                # _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
                # e = sess.run(error, feed_dict={x: state_vector, y: Q_vector})
                # sess.run(optimizer)

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if episode % print_episode == 0 and episode != 0:
                # print("Episode:", episode, " Score:", info["score"])
                print("Episode:", episode, "\ttime:", avg_time/print_episode, "\tscore:", avg_score/print_episode, "\tError", avg_error/print_episode)
                # print("error tensor:", e)
                avg_time = 0
                avg_score = 0
                avg_error = 0
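# --- Illustrative sketch (not part of the original training code) ---
# The action choice in run2() above, and in every loop below, is plain
# epsilon-greedy: with probability epsilon take a random action, otherwise take
# the action whose predicted Q value is largest. A minimal NumPy version of the
# same rule, independent of the TensorFlow graph; the helper name, the 4-action
# space and the example values are assumptions made for illustration only
# (uses the module-level numpy import, np).

def epsilon_greedy_action(q_vector, epsilon, action_space=4):
    """Return a random action index with probability epsilon, else the greedy one."""
    if np.random.rand() <= epsilon:
        return np.random.randint(action_space)  # explore: uniform random action
    return int(np.argmax(q_vector))             # exploit: index of the largest Q value

# Example: with epsilon = 0.1 this usually returns action 1 (the greedy choice),
# and occasionally one of the other three actions.
# epsilon_greedy_action(np.array([[0.1, 0.7, 0.2, 0.0]]), epsilon=0.1)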
def trainDeepModel(load = False):
    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False

    # True - Load model from MODEL_PATH_LOAD; False - Initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap = WRAP, grid_size = GRID_SIZE, rate = 80, max_time = 100, tail = TAIL, action_space = 4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01    # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99    # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1   # Probability to choose a random action instead of the best action

    # Linear annealing schedules for epsilon and alpha
    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # as a decimal fraction of total_episodes

    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9    # as a decimal fraction of total_episodes

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, hidden_1_layer, hidden_2_layer, output_layer = createDeepModel(x, load_variables = load)

    # Error / Loss function
    # mean_squared_error reduces the [1,4] tensor to a scalar mean of the squared differences
    with tf.name_scope('Error'):
        # test
        error = tf.losses.mean_squared_error(labels=y, predictions=Q_values)
        # error = tf.reduce_max(tf.sqrt(tf.square(tf.subtract(Q_values, y))), axis=1) # Doesn't work!
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

        tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1,4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (Made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # error plot
    # errors = []

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model to keep training; otherwise initialise fresh weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")
        else:
            # Initialize global variables
            sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        # Training loop
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            # Linear annealing function for alpha
            if alpha_function:
                alpha = (-alpha_start / (alpha_percentage * total_episodes)) * episode + (alpha_start + alpha_end)
                if alpha < alpha_end:
                    alpha = alpha_end

            # Linear annealing function for epsilon
            if epsilon_function:
                epsilon = (-epsilon_start / (epsilon_percentage * total_episodes)) * episode + (epsilon_start + epsilon_end)
                if epsilon < epsilon_end:
                    epsilon = epsilon_end

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector", Q_vector) # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the chosen action
                new_state, reward, done, info = env.step(action)
                state = new_state

                # If this is the final state of the episode, there is no future value to bootstrap
                if done:
                    Q_vector[:, action] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now-current state's action-value vector
                    new_state_vector = env.state_vector()
                    y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                    # Equation for training
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # RL equation: one-step Q-learning target
                    Q_vector[:, action] = reward + (gamma * maxq)

                _, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
                # _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
                # e = sess.run(error, feed_dict={x: state_vector, y: Q_vector})
                # sess.run(optimizer)

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                # Add to the error list to show a plot at the end of training - RAM OVERLOAD!!!
                # errors.append(e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                print("Ep:", episode, "\tavg t:", avg_time/print_episode, "\tavg score:", avg_score/print_episode, "\tErr", round(avg_error/print_episode, 3), "\tepsilon", round(epsilon, 2))
                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to text files
                w1 = np.array(sess.run(hidden_1_layer['weights']))
                b1 = np.array(sess.run(hidden_1_layer['biases']))
                w2 = np.array(sess.run(hidden_2_layer['weights']))
                b2 = np.array(sess.run(hidden_2_layer['biases']))
                w3 = np.array(sess.run(output_layer['weights']))
                b3 = np.array(sess.run(output_layer['biases']))

                np.savetxt(W1_textfile_path_save, w1.astype(float), fmt='%f', delimiter = " ")
                np.savetxt(B1_textfile_path_save, b1.astype(float), fmt='%f', delimiter = " ")
                np.savetxt(W2_textfile_path_save, w2.astype(float), fmt='%f', delimiter = " ")
                np.savetxt(B2_textfile_path_save, b2.astype(float), fmt='%f', delimiter = " ")
                np.savetxt(W3_textfile_path_save, w3.astype(float), fmt='%f', delimiter = " ")
                np.savetxt(B3_textfile_path_save, b3.astype(float), fmt='%f', delimiter = " ")

                # Write the error summary for Tensorboard
                s = sess.run(merged_summary, feed_dict={x: state_vector, y: Q_vector})
                writer.add_summary(s, episode)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
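# --- Illustrative sketch (not part of the original training code) ---
# The target construction inside trainDeepModel() above is one-step Q-learning:
# the network's current output for the state is copied, and only the entry of
# the action actually taken is overwritten with the TD target
# reward + gamma * max_a' Q(s', a'), or with just the reward on a terminal step.
# A small NumPy version of that update, outside the TensorFlow graph; the
# function name, shapes and example numbers are assumptions made for
# illustration only (uses the module-level numpy import, np).

def q_learning_target(q_s, q_s_prime, action, reward, gamma, done):
    """Return the [1, n_actions] training target for a single transition."""
    target = q_s.copy()                  # leave the untaken actions' values unchanged
    if done:
        target[:, action] = reward       # terminal step: no bootstrapped future value
    else:
        target[:, action] = reward + gamma * np.max(q_s_prime)
    return target

# Example: action 2 taken, reward 1.0, best next-state Q value 0.5, gamma 0.99
# q_learning_target(np.array([[0.1, 0.2, 0.3, 0.0]]),
#                   np.array([[0.5, 0.4, 0.1, 0.0]]),
#                   action=2, reward=1.0, gamma=0.99, done=False)
# -> array([[0.1, 0.2, 1.495, 0.0]])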
def run():  # Testing
    print("\n ----- Running the Linear Function Q-Learning Model ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap = WRAP, grid_size = GRID_SIZE, rate = 100, max_time = 100, tail = TAIL, action_space = 4)

    if RENDER_TO_SCREEN:
        env.prerender()

    epsilon = 0.01  # Probability to choose a random action instead of the best action

    # Create NN model
    Q_values, output_layer, hidden_1_layer = recreateModel(x)

    action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    got_food = 0

    print_episode = 10
    total_episodes = 100

    # Initialising all variables (weights and biases)
    model = tf.global_variables_initializer()

    # Session can start running
    with tf.Session() as sess:
        sess.run(model)

        # Testing my DQN model with random values
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print(Q_vector) # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})
                    # action = sess.run(tf.argmax(Q_vector, axis=1))
                    # action = np.argmax(Q[env.state_index(state)])

                # Update the environment by performing the chosen action
                new_state, reward, done, info = env.step(action)
                # Q[env.state_index(state), action] += alpha * (reward + gamma * np.max(Q[env.state_index(new_state)]) - Q[env.state_index(state), action])

                state = new_state

                if reward == 100:
                    got_food += 1

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                # print("Episode:", episode, " Score:", info["score"])
                print("Episode:", episode, " time:", avg_time/print_episode, " score:", avg_score/print_episode, " Got food", got_food, "times")
                avg_time = 0
                avg_score = 0
def runDeepModel():  # Testing
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - Load model from MODEL_PATH_LOAD; False - Initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap = WRAP, grid_size = GRID_SIZE, rate = 50, max_time = 100, tail = TAIL, action_space = 4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01    # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99    # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.01  # Probability to choose a random action instead of the best action

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, hidden_1_layer, hidden_2_layer, output_layer = createDeepModel(x, load_variables = True)

    # Error / Loss function
    # mean_squared_error reduces the [1,4] tensor to a scalar mean of the squared differences
    with tf.name_scope('Error'):
        # e1 = tf.subtract(y, Q_values)
        # e2 = tf.square(e1)
        # error = tf.reduce_mean(e2, axis=1)

        # test
        error = tf.losses.mean_squared_error(labels=y, predictions=Q_values)
        # error = tf.reduce_max(tf.sqrt(tf.square(tf.subtract(Q_values, y))), axis=1)
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    # Gradient descent optimizer - minimizes error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1,4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (Made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 10
    total_episodes = 100

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    model = tf.global_variables_initializer()

    # Session can start running
    with tf.Session() as sess:

        # Restore the model; otherwise initialise fresh weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")
        else:
            sess.run(model)

        # Testing my DQN model with random values
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Qvector", Q_vector) # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value of the Q values (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the chosen action
                new_state, reward, done, info = env.step(action)
                state = new_state

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                print("Ep:", episode, " avg t:", avg_time/print_episode, " avg score:", avg_score/print_episode)
                avg_time = 0
                avg_score = 0
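# --- Possible entry point (a sketch, not the author's original main block) ---
# Which of the functions above to call depends on whether trained weights
# already exist on disk, so the choice below is an assumption.
if __name__ == '__main__':
    trainDeepModel(load=False)  # train from scratch; checkpoints and text files are saved as the loop runs
    # runDeepModel()            # run this instead once a trained model has been saved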