def run():
    ''' Run the tabular Q-learning agent for a few test episodes. '''

    RENDER_TO_SCREEN = True

    # Setting up the environment
    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=False,
                      food_count=1,
                      obstacle_count=0,
                      multiplier_count=0,
                      map_path=None,
                      action_space=5)

    if RENDER_TO_SCREEN:
        env.prerender()

    Q = Qmatrix(2, env)  # 0 - zeros, 1 - random, 2 - textfile

    # Keep exploration to a minimum during testing
    epsilon = 0.005

    # Testing for a fixed number of episodes
    for episode in range(10):
        # Reset the environment at the start of each episode
        state, info = env.reset()
        done = False

        while not done:
            if RENDER_TO_SCREEN:
                env.render()

            # Epsilon-greedy: random action with probability epsilon,
            # otherwise the greedy action from the Q lookup table
            if np.random.rand() <= epsilon:
                action = env.sample_action()
            else:
                action = np.argmax(Q[env.state_index(state)])

            new_state, reward, done, info = env.step(action)

            # No learning during testing:
            # Q[env.state_index(state), action] += alpha * (reward + gamma * np.max(Q[env.state_index(new_state)]) - Q[env.state_index(state), action])

            state = new_state

        # Print the result of every episode
        if episode % 1 == 0:
            print("Episode:", episode, "\tScore:", info["score"], "\tTime:", info["time"])
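# The test loop above relies on the same epsilon-greedy rule used throughout this
# project. As a rough, self-contained sketch (plain NumPy, not the project's
# Environment/Qmatrix API), the action choice for one state of a tabular agent
# could look like this; `q_row` and `epsilon_greedy_sketch` are illustrative
# names, not identifiers from this repository.
import numpy as np


def epsilon_greedy_sketch(q_row, epsilon, rng=np.random):
    """Pick a random action with probability epsilon, else the greedy one."""
    n_actions = len(q_row)
    if rng.rand() <= epsilon:
        return rng.randint(n_actions)   # explore
    return int(np.argmax(q_row))        # exploit

# Example: with epsilon = 0.005 (as in the test run above) the greedy action is
# chosen roughly 99.5% of the time.
# epsilon_greedy_sketch(np.array([0.1, 0.7, 0.3, 0.0, 0.2]), 0.005)  # usually 1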
def run2():
    # Testing
    print("Running the Linear Function Q-Learning Model from tf.Saver()")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - load the model from MODEL_PATH_LOAD; False - initialise random weights
    USE_SAVED_MODEL_FILE = True

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=100,
                      max_time=20,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01   # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99   # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1  # Probability to choose a random action instead of the best action

    # Create NN model
    with tf.name_scope('Model'):
        Q_values = createModel(x)

    # Error / loss function
    # reduce_max reduces the [1, 4] squared-difference tensor to a scalar, i.e. only
    # the worst action's error is minimised (mean-based variants are kept below)
    with tf.name_scope('Error'):
        # e1 = tf.subtract(y, Q_values)
        # e2 = tf.square(e1)
        # error = tf.reduce_mean(e2, axis=1)
        error = tf.reduce_max(tf.square(Q_values - y), axis=1)
        # error = tf.square(tf.subtract(y, Q_values))

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1, 4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 100
    total_episodes = 10000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    model = tf.global_variables_initializer()

    # Tensorboard capabilities
    # writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model to keep training; otherwise initialise random weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            # Alternative restore method:
            # new_saver = tf.train.import_meta_graph('my-model.meta')
            # new_saver.restore(sess, tf.train.latest_checkpoint('./'))
            print("Model restored.")
        else:
            sess.run(model)

        # Testing my DQN model
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Q_vector", Q_vector)  # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value in the Q vector (output of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                if reward == 100:
                    print("reached food")

                # Gathering the now current state's action-value vector
                # new_state_vector = env.state_vector()
                # y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                # Equation for training
                # maxq = sess.run(y_prime_max, feed_dict={y: y_prime})
                # Q_vector[:, action] = reward + (gamma * maxq)

                _, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
                # _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
                # e = sess.run(error, feed_dict={x: state_vector, y: Q_vector})

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if episode % print_episode == 0 and episode != 0:
                # print("Episode:", episode, " Score:", info["score"])
                print("Episode:", episode,
                      "\ttime:", avg_time / print_episode,
                      "\tscore:", avg_score / print_episode,
                      "\tError:", avg_error / print_episode)
                avg_time = 0
                avg_score = 0
                avg_error = 0
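# run2() trains on tf.reduce_max(tf.square(Q_values - y), axis=1), so only the
# largest squared difference in the [1, 4] Q vector contributes to the loss,
# whereas the commented-out reduce_mean variant would average over all four
# actions. A tiny NumPy-only sketch of that difference (illustrative values,
# no TensorFlow):
import numpy as np

q_out = np.array([[0.2, 0.9, 0.1, 0.4]])   # hypothetical network output
q_tgt = np.array([[0.2, 0.5, 0.1, 0.4]])   # hypothetical training target

sq_diff = np.square(q_out - q_tgt)          # [[0.  , 0.16, 0.  , 0.  ]]
print(np.max(sq_diff, axis=1))              # [0.16] -> what reduce_max uses
print(np.mean(sq_diff, axis=1))             # [0.04] -> what reduce_mean would use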
def run():
    # Testing
    print("\n ----- Running the Linear Function Q-Learning Model ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=100,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    epsilon = 0.01  # Probability to choose a random action instead of the best action

    # Create NN model
    Q_values, output_layer, hidden_1_layer = recreateModel(x)

    action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    got_food = 0

    print_episode = 10
    total_episodes = 100

    # Initialising all variables (weights and biases)
    model = tf.global_variables_initializer()

    # Session can start running
    with tf.Session() as sess:

        sess.run(model)

        # Testing my DQN model
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print(Q_vector)  # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value in the Q vector (output of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})
                    # action = sess.run(tf.argmax(Q_vector, axis=1))
                    # action = np.argmax(Q[env.state_index(state)])

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                # Q[env.state_index(state), action] += alpha * (reward + gamma * np.max(Q[env.state_index(new_state)]) - Q[env.state_index(state), action])

                state = new_state

                if reward == 100:
                    got_food += 1

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                # print("Episode:", episode, " Score:", info["score"])
                print("Episode:", episode,
                      " time:", avg_time / print_episode,
                      " score:", avg_score / print_episode,
                      " Got food", got_food, "times")
                avg_time = 0
                avg_score = 0
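# Both linear-model runs feed env.state_vector() into the network as a "one-hot
# representation of the current state". The exact layout is defined inside the
# Environment class; purely as an illustration of the idea, a one-hot encoding of
# a single cell on a GRID_SIZE x GRID_SIZE board might look like the sketch below
# (hypothetical helper, not the project's code).
import numpy as np


def one_hot_cell_sketch(row, col, grid_size):
    """Return a [1, grid_size * grid_size] vector with a 1 at the given cell."""
    vec = np.zeros((1, grid_size * grid_size), dtype=np.float32)
    vec[0, row * grid_size + col] = 1.0
    return vec

# one_hot_cell_sketch(2, 3, 10) -> shape (1, 100), single 1 at index 23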
def trainDeepModel(load=False):
    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False

    # True - load the model from MODEL_PATH_LOAD; False - initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01   # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99   # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1  # Probability to choose a random action instead of the best action

    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, hidden_1_layer, hidden_2_layer, output_layer = createDeepModel(x, load_variables=load)

    # Error / loss function
    # Mean squared error between the network output and the target Q vector
    # (reduce_max variants, which only penalise the worst action, are kept below)
    with tf.name_scope('Error'):
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.sqrt(tf.square(tf.subtract(Q_values, y))), axis=1)  # Doesn't work!
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

        tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1, 4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # error plot
    # errors = []

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model to keep training; otherwise initialise random weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")
        else:
            # Initialize global variables
            sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        # Training the DQN model
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            # Linear annealing function for alpha
            if alpha_function:
                alpha = (-alpha_start / (alpha_percentage * total_episodes)) * episode + (alpha_start + alpha_end)
                if alpha < alpha_end:
                    alpha = alpha_end

            # Linear annealing function for epsilon
            if epsilon_function:
                epsilon = (-epsilon_start / (epsilon_percentage * total_episodes)) * episode + (epsilon_start + epsilon_end)
                if epsilon < epsilon_end:
                    epsilon = epsilon_end

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Q_vector", Q_vector)  # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value in the Q vector (output of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                # If this is the final state of the episode, the target is just the reward
                if done:
                    Q_vector[:, action] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now current state's action-value vector
                    new_state_vector = env.state_vector()
                    y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                    # Equation for training
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # RL Equation
                    Q_vector[:, action] = reward + (gamma * maxq)

                _, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
                # _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
                # e = sess.run(error, feed_dict={x: state_vector, y: Q_vector})

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                # Appending to an error list for plotting causes a RAM overload during long runs
                # errors.append(e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                print("Ep:", episode,
                      "\tavg t:", avg_time / print_episode,
                      "\tavg score:", avg_score / print_episode,
                      "\tErr", round(avg_error / print_episode, 3),
                      "\tepsilon", round(epsilon, 2))
                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to text files
                w1 = np.array(sess.run(hidden_1_layer['weights']))
                b1 = np.array(sess.run(hidden_1_layer['biases']))
                w2 = np.array(sess.run(hidden_2_layer['weights']))
                b2 = np.array(sess.run(hidden_2_layer['biases']))
                w3 = np.array(sess.run(output_layer['weights']))
                b3 = np.array(sess.run(output_layer['biases']))

                np.savetxt(W1_textfile_path_save, w1.astype(np.float64), fmt='%f', delimiter=" ")
                np.savetxt(B1_textfile_path_save, b1.astype(np.float64), fmt='%f', delimiter=" ")
                np.savetxt(W2_textfile_path_save, w2.astype(np.float64), fmt='%f', delimiter=" ")
                np.savetxt(B2_textfile_path_save, b2.astype(np.float64), fmt='%f', delimiter=" ")
                np.savetxt(W3_textfile_path_save, w3.astype(np.float64), fmt='%f', delimiter=" ")
                np.savetxt(B3_textfile_path_save, b3.astype(np.float64), fmt='%f', delimiter=" ")

                s = sess.run(merged_summary, feed_dict={x: state_vector, y: Q_vector})
                writer.add_summary(s, episode)

                save_path = saver.save(sess, MODEL_PATH_SAVE)
                print("Model saved in path: %s" % save_path)
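# trainDeepModel() anneals epsilon (and optionally alpha) linearly over roughly
# the first epsilon_percentage of the episodes and then clamps it at epsilon_end.
# A standalone sketch of that intended start-to-end decay (illustrative helper;
# the in-line formula above uses a slightly different intercept of
# epsilon_start + epsilon_end):
def linear_schedule_sketch(episode, total_episodes, start, end, fraction):
    """Decay linearly from `start` to `end` over the first `fraction` of episodes."""
    decay_episodes = fraction * total_episodes
    value = start - (start - end) * (episode / decay_episodes)
    return max(value, end)

# linear_schedule_sketch(0, 100000, 0.5, 0.05, 0.5)      -> 0.5
# linear_schedule_sketch(25000, 100000, 0.5, 0.05, 0.5)  -> 0.275
# linear_schedule_sketch(60000, 100000, 0.5, 0.05, 0.5)  -> 0.05 (clamped)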
def runDeepModel():
    # Testing
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - load the model from MODEL_PATH_LOAD; False - initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=50,
                      max_time=100,
                      tail=TAIL,
                      action_space=4)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.01    # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99    # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.01  # Probability to choose a random action instead of the best action

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, hidden_1_layer, hidden_2_layer, output_layer = createDeepModel(x, load_variables=True)

    # Error / loss function
    # Mean squared error between the network output and the target Q vector
    # (reduce_mean/reduce_max variants are kept below for reference)
    with tf.name_scope('Error'):
        # e1 = tf.subtract(y, Q_values)
        # e2 = tf.square(e1)
        # error = tf.reduce_mean(e2, axis=1)

        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.sqrt(tf.square(tf.subtract(Q_values, y))), axis=1)
        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1, 4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    print_episode = 10
    total_episodes = 100

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    model = tf.global_variables_initializer()

    # Session can start running
    with tf.Session() as sess:

        # Restore the model to keep training; otherwise initialise random weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")
        else:
            sess.run(model)

        # Testing my DQN model
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # One-hot representation of the current state
                state_vector = env.state_vector()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Q_vector", Q_vector)  # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value in the Q vector (output of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                state = new_state

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]

            if episode % print_episode == 0 and episode != 0:
                print("Ep:", episode,
                      " avg t:", avg_time / print_episode,
                      " avg score:", avg_score / print_episode)
                avg_time = 0
                avg_score = 0
def train():
    ''' Train the tabular Q-learning agent and periodically save the Q matrix to a text file. '''

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    # Setting up the environment: the grid size, the rate at which the snake
    # moves, the maximum episode time, whether the snake has a tail, how many
    # food items, obstacles and multipliers are spawned, an optional map file
    # and the size of the action space.
    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=100,
                      tail=False,
                      food_count=1,
                      obstacle_count=0,
                      multiplier_count=0,
                      map_path=None,
                      action_space=5)

    if RENDER_TO_SCREEN:
        env.prerender()

    Q = Qmatrix(1, env)  # 0 - zeros, 1 - random, 2 - textfile

    # Hyper-parameters
    alpha = 0.15   # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99   # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1  # Probability to choose a random action instead of the best action

    epsilon_function = True
    epsilon_start = 0.8
    epsilon_end = 0.05
    epsilon_percentage = 0.6  # in decimal

    avg_time = 0
    avg_score = 0

    print_episode = 1000
    total_episodes = 10000

    for episode in range(total_episodes):
        # Reset the environment at the start of each episode
        state, info = env.reset()
        done = False

        # Epsilon linear annealing: decay epsilon from epsilon_start towards
        # epsilon_end over the first epsilon_percentage of total_episodes,
        # then clamp it at epsilon_end
        if epsilon_function:
            epsilon = (-(epsilon_start - epsilon_end) / (epsilon_percentage * total_episodes)) * episode + epsilon_start
            if epsilon < epsilon_end:
                epsilon = epsilon_end

        while not done:
            # Wrapped in try/except so the Q lookup table is still saved if training is cancelled
            try:
                if RENDER_TO_SCREEN:
                    env.render()

                # Epsilon-greedy: random action with probability epsilon,
                # otherwise the greedy action from the Q lookup table
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    action = np.argmax(Q[env.state_index(state)])

                new_state, reward, done, info = env.step(action)
                # print(state)

                # Tabular Q-learning update
                Q[env.state_index(state), action] += alpha * (
                    reward + gamma * np.max(Q[env.state_index(new_state)]) - Q[env.state_index(state), action])

                state = new_state

                if done:
                    # Accumulate time and score for the periodic progress print-out
                    avg_time += info["time"]
                    avg_score += info["score"]

            except KeyboardInterrupt as e:
                # Write the Q matrix to file before re-raising, so progress isn't lost
                np.savetxt(Q_textfile_path_save, Q.astype(np.float64), fmt='%f', delimiter=" ")
                print("Saved Q matrix to text file")
                raise e

        # Print progress and save the Q matrix every print_episode episodes and on the last episode
        if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
            print("Episode:", episode,
                  "\tavg t: {0:.3f}".format(avg_time / print_episode),
                  "\tavg score: {0:.3f}".format(avg_score / print_episode),
                  "\tepsilon {0:.3f}".format(epsilon))
            np.savetxt(Q_textfile_path_save, Q.astype(np.float64), fmt='%f', delimiter=" ")
            avg_time = 0
            avg_score = 0

    # The Q matrix has already been saved on the final episode above
    # np.savetxt(Q_textfile_path_save, Q.astype(np.float64), fmt='%f', delimiter=" ")
    print("Simulation finished. \nSaved Q matrix to text file at:", Q_textfile_path_save)
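# train() updates the lookup table with the standard tabular Q-learning rule,
# Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
# A minimal NumPy sketch of a single update, using plain integer indices instead
# of env.state_index()/Qmatrix (illustrative only, not the project's API):
import numpy as np


def q_update_sketch(Q, s, a, reward, s_next, alpha=0.15, gamma=0.99):
    """Apply one Q-learning update to table Q for the transition (s, a, r, s')."""
    td_target = reward + gamma * np.max(Q[s_next])  # best value reachable from s'
    td_error = td_target - Q[s, a]                  # how wrong the current estimate is
    Q[s, a] += alpha * td_error                     # move a fraction alpha towards the target
    return Q

# Example: a 3-state, 2-action table
# Q = np.zeros((3, 2)); q_update_sketch(Q, 0, 1, reward=1.0, s_next=2)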
def trainDeepModel(load=False):
    # Used to see how long the model takes to train - the model still needs to be optimised!
    start_time = time.time()

    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - load the model from MODEL_PATH_LOAD; False - initialise random weights
    USE_SAVED_MODEL_FILE = False

    # First we need our environment from Environment_for_DQN.py
    # It has to have a grid_size of 10 for this current NN
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=0,
                      max_time=300,
                      tail=TAIL,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      action_space=3)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.001  # Learning rate, i.e. which fraction of the Q values should be updated
    gamma = 0.99   # Discount factor, i.e. to which extent the algorithm considers possible future rewards
    epsilon = 0.1  # Probability to choose a random action instead of the best action

    epsilon_function = True
    epsilon_start = 0.1
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Trajectory (replay memory)
    tau = []

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=load)

    # Error / loss function
    # Mean squared error between the network output and the target Q vector
    # (reduce_max variants, which only penalise the worst action, are kept below)
    with tf.name_scope('Error'):
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)

        # error = tf.reduce_max(tf.square(tf.subtract(Q_values, y)), axis=1)
        # error = tf.reduce_max(tf.square(Q_values - y), axis=1)

        tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes the error/loss function
    with tf.name_scope('Optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(error)
        # optimizer = tf.train.AdamOptimizer(alpha).minimize(error)

    # The next state's action-value [1, 4] tensor, reduced to a scalar of the max value
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Action at time t, the index of the max value in the action-value tensor (made a global variable)
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # error plot
    # errors = []

    print_episode = 1000
    total_episodes = 100000

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Session can start running
    with tf.Session() as sess:

        # Restore the model to keep training; otherwise initialise random weights.
        # (Running the initializer after a restore would overwrite the restored weights.)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")
        else:
            # Initialize global variables
            sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(time.time() - start_time))
        start_time = time.time()

        # Training the DQN model
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            # Linear annealing function for alpha
            if alpha_function:
                alpha = (-alpha_start / (alpha_percentage * total_episodes)) * episode + (alpha_start + alpha_end)
                if alpha < alpha_end:
                    alpha = alpha_end

            # Linear annealing function for epsilon
            if epsilon_function:
                epsilon = (-(epsilon_start - epsilon_end) / (epsilon_percentage * total_episodes)) * episode + epsilon_start
                if epsilon < epsilon_end:
                    epsilon = epsilon_end

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # Pixel representation of the current state
                state_vector = env.pixels()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})
                # print("Q_vector", Q_vector)  # DEBUGGING

                # Deciding which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # "action" is the index of the max value in the Q vector (output of the NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update the environment by performing the action
                new_state, reward, done, info = env.step(action)

                # Update the trajectory (replay memory), dropping the oldest entry once full
                if len(tau) < REPLAY_MEMORY:
                    tau.append(Traj(state_vector, action, reward, env.pixels(), done))
                else:
                    tau.pop(0)
                    tau.append(Traj(state_vector, action, reward, env.pixels(), done))

                state = new_state

                # Choose a random step from the replay memory
                random_tau = random.randint(0, len(tau) - 1)

                # Get the Q vector of the sampled training step
                Q_vector = sess.run(Q_values, feed_dict={x: tau[random_tau].state})

                ''' Training using replay memory '''
                # If the sampled transition terminates the episode,
                # set the chosen action's value to the reward only
                if tau[random_tau].done:
                    Q_vector[:, tau[random_tau].action] = tau[random_tau].reward
                else:
                    # Get the Q vector of the sampled transition's next state
                    y_prime = sess.run(Q_values, feed_dict={x: tau[random_tau].new_state})

                    # Getting the best action value
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # RL DQN training equation
                    Q_vector[:, tau[random_tau].action] = tau[random_tau].reward + (gamma * maxq)

                _, e = sess.run([optimizer, error],
                                feed_dict={x: tau[random_tau].state, y: Q_vector})

                '''
                Standard training with learning after every step

                # if final state of the episode
                if done:
                    Q_vector[:, action] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now current state's action-value vector
                    new_state_vector = env.local_state_vector_3D()
                    y_prime = sess.run(Q_values, feed_dict={x: new_state_vector})

                    # Equation for training
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})

                    # RL Equation
                    Q_vector[:, action] = reward + (gamma * maxq)

                _, e = sess.run([optimizer, error], feed_dict={x: state_vector, y: Q_vector})
                # _ = sess.run(optimizer, feed_dict={x: state_vector, y: Q_vector})
                # e = sess.run(error, feed_dict={x: state_vector, y: Q_vector})
                '''

                # DEBUGGING
                # print("action:", action)
                # print("y_prime:", y_prime)
                # print("max q value:", maxq)
                # print("new Q_vector:", Q_vector)
                # print("error tensor:", e)

                # Appending to an error list for plotting causes a RAM overload during long runs
                # errors.append(e)

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = time.time() - start_time
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(epsilon),
                      end="")

                # Elapsed time formatted as h:mm:ss
                hours = math.floor((current_time / 60) / 60)
                minutes = math.floor((current_time / 60) % 60)
                seconds = current_time % 60
                print("\ttime {0:.0f}:{1:02.0f}:{2:02.0f}".format(hours, minutes, seconds))

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npy files (can't save a 4D array to a text file)
                W_conv1 = np.array(sess.run(weights['W_conv1']))
                W_conv2 = np.array(sess.run(weights['W_conv2']))
                W_fc = np.array(sess.run(weights['W_fc']))
                W_out = np.array(sess.run(weights['W_out']))

                b_conv1 = np.array(sess.run(biases['b_conv1']))
                b_conv2 = np.array(sess.run(biases['b_conv2']))
                b_fc = np.array(sess.run(biases['b_fc']))
                b_out = np.array(sess.run(biases['b_out']))

                np.save(W_conv1_textfile_path_save, W_conv1.astype(np.float32))
                np.save(W_conv2_textfile_path_save, W_conv2.astype(np.float32))
                np.save(W_fc_textfile_path_save, W_fc.astype(np.float32))
                np.save(W_out_textfile_path_save, W_out.astype(np.float32))

                np.save(b_conv1_textfile_path_save, b_conv1.astype(np.float32))
                np.save(b_conv2_textfile_path_save, b_conv2.astype(np.float32))
                np.save(b_fc_textfile_path_save, b_fc.astype(np.float32))
                np.save(b_out_textfile_path_save, b_out.astype(np.float32))

                s = sess.run(merged_summary, feed_dict={x: state_vector, y: Q_vector})
                writer.add_summary(s, episode)

                save_path = saver.save(sess, MODEL_PATH_SAVE)
                print("Model saved in path: %s" % save_path)
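# The replay memory above stores Traj(state, action, reward, new_state, done)
# objects in a plain list capped at REPLAY_MEMORY entries and samples one of them
# uniformly per step. Traj itself is defined elsewhere in the project; as an
# assumption of its shape, a namedtuple-based sketch of the same buffer logic
# (illustrative names, not the project's code):
import random
from collections import namedtuple

TrajSketch = namedtuple("TrajSketch", ["state", "action", "reward", "new_state", "done"])


class ReplayMemorySketch:
    """FIFO buffer with uniform random sampling, mirroring the list used above."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def store(self, state, action, reward, new_state, done):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)  # drop the oldest transition once full
        self.buffer.append(TrajSketch(state, action, reward, new_state, done))

    def sample_one(self):
        return self.buffer[random.randint(0, len(self.buffer) - 1)]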
lastAction = 0

while not done:
    action = brain.chooseAction(observation)
    observation_, reward, done, info = env.step(action)
    observation_ = env.state_vector_3D()
    score += reward

    brain.storeTransition(observation, action, reward, observation_)
    observation = observation_

    if TRAIN:
        loss = brain.learn(batch_size)

    lastAction = action

    if RENDER:
        env.render()

avg_score += info["score"]

if TRAIN:
    avg_loss += loss.item()

if (i % 100 == 0 and i != 0) or i == numGames - 1:
    print("Game", i,
          "\tepsilon: %.4f" % brain.EPSILON,
          "\tavg score", avg_score / 100,
          "avg loss:", avg_loss / 100)
    brain.save_model("./Models/Torch2/my_model{}.pth".format(i))
    avg_loss = 0
    avg_score = 0

scores.append(score)
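# The loop above delegates action selection, transition storage and learning to a
# "brain" agent defined elsewhere (it exposes chooseAction, storeTransition, learn,
# save_model and an EPSILON attribute). Purely as an assumption of what its
# epsilon-greedy chooseAction might resemble for a PyTorch Q-network, not the
# project's actual implementation:
import random

import numpy as np
import torch


def choose_action_sketch(q_network, observation, epsilon, n_actions):
    """Epsilon-greedy action selection for a state-in, Q-values-out network."""
    if random.random() <= epsilon:
        return random.randrange(n_actions)  # explore
    with torch.no_grad():
        state = torch.as_tensor(np.asarray(observation), dtype=torch.float32).unsqueeze(0)
        q_values = q_network(state)         # shape [1, n_actions]
        return int(torch.argmax(q_values, dim=1).item())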