def train():
    """Train the PyTorch agent on the grid-world environment.

    The replay memory is filled first by playing games with actions taken
    from ``brain.chooseAction``.  After that the agent calls ``brain.learn``
    once per environment step.  Running averages are printed and a numbered
    checkpoint is saved every ``print_episode`` episodes; the final weights
    are saved to ``./Models/Torch/my_model.pth``.
    """
    print()
    print("RUNNING THE MINECRAFT SIMULATION")
    print()

    # Run configuration.
    RENDER = False
    LOAD_MODEL = False
    start_eps = 0.8

    WRAP = False
    GRID_SIZE = 7
    FOOD_COUNT = 1
    OBSTACLE_COUNT = 0
    # Set to e.g. "./Maps/Grid{}/map2.txt".format(GRID_SIZE) to use a map file.
    MAP_PATH = None

    env = Environment(
        wrap=WRAP,
        grid_size=GRID_SIZE,
        rate=80,
        max_time=30,
        food_count=FOOD_COUNT,
        obstacle_count=OBSTACLE_COUNT,
        lava_count=0,
        zombie_count=0,
        action_space=5,
        map_path=MAP_PATH,
    )

    brain = Agent(gamma=0.99, epsilon=start_eps, alpha=0.01,
                  maxMemorySize=10000, replace=10)

    if LOAD_MODEL:
        path = "./Models/Torch/my_model.pth"
        try:
            brain.load_model(path)
        except Exception as err:
            # Best effort: fall back to fresh weights, but show why loading
            # failed instead of hiding the cause.
            print('Could not load model')
            print('Reason:', err)
            print('Press <ENTER> to continue with random initialision')
            print()
            input()
        else:
            print("Model loaded from path:", path)
            print()
            brain.EPSILON = 0.05

    if RENDER:
        env.prerender()

    games_played = 0

    print("INITIALISING REPLAY MEMORY")

    # Play until the agent's replay memory counter reaches its capacity.
    while brain.memCntr < brain.memSize:
        observation, _ = env.reset()
        done = False

        if RENDER:
            env.render()  # Render first screen

        while not done:
            action = brain.chooseAction(observation)
            observation_, reward, done, info = env.step(action)

            if done:
                games_played += 1

            brain.storeTransition(observation, action, reward, done,
                                  observation_)
            observation = observation_

            if RENDER:
                env.render()

    print("Done initialising replay memory. Played {} games".format(
        games_played))

    numGames = 100000
    print_episode = 100
    batch_size = 16

    # Running sums over the current reporting window.
    avg_score = 0
    avg_time = 0
    avg_loss = 0
    # Number of episodes in the window.  The first window holds
    # print_episode + 1 episodes and the last one may be shorter, so the
    # averages divide by this count rather than by print_episode.
    window_episodes = 0

    print()
    print("TRAINING MODEL")
    print()

    for i in range(numGames):
        done = False
        observation, _ = env.reset()

        if RENDER:
            env.render()  # Render first screen

        while not done:
            action = brain.chooseAction(observation)
            observation_, reward, done, info = env.step(action)

            brain.storeTransition(observation, action, reward, done,
                                  observation_)
            observation = observation_

            loss = brain.learn(batch_size)

            if RENDER:
                env.render()

        # Only the loss of the episode's last learning step is accumulated.
        avg_score += info["score"]
        avg_time += info["time"]
        avg_loss += loss.item()
        window_episodes += 1

        if (i % print_episode == 0 and i != 0) or i == numGames - 1:
            print("Episode", i,
                  "\tavg time: {0:.3f}".format(avg_time / window_episodes),
                  "\tavg score: {0:.3f}".format(avg_score / window_episodes),
                  "\tavg loss: {0:.3f}".format(avg_loss / window_episodes),
                  "\tepsilon: %.4f" % brain.EPSILON)

            brain.save_model("./Models/Torch/my_model{}.pth".format(i))

            avg_loss = 0
            avg_score = 0
            avg_time = 0
            window_episodes = 0

    brain.save_model("./Models/Torch/my_model.pth")
def run():
    """Run a previously trained dojo network in the environment.

    Builds the environment from a fixed map file, loads the ``Network`` named
    ``MODEL_NAME`` as non-trainable, and plays ``total_episodes`` episodes
    with ``epsilon=0.0``, printing average time, score and cumulative reward.
    """
    MODEL_NAME = "explore15_input6"
    FOLDER = "Best_Dojos"
    MODEL_DIR = "./Models/Tensorflow/" + FOLDER + "/"
    MODEL_PATH_SAVE = MODEL_DIR + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 32
    LOCAL_GRID_SIZE = 15
    RANDOMIZE_MAPS = False

    # The template has a single placeholder, so only GRID_SIZE is passed.
    MAP_PATH = "./Maps/Grid{}/impossible_map1.txt".format(GRID_SIZE)

    print("\n ---- Running the Deep Q Network ----- \n")

    RENDER_TO_SCREEN = True

    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        local_size=LOCAL_GRID_SIZE,
        rate=80,
        max_time=60,
        food_count=0,
        obstacle_count=0,
        lava_count=0,
        zombie_count=0,
        history=40,
        action_space=5,
        map_path=MAP_PATH,
    )

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True,
                    path=MODEL_DIR, trainable=False)

    brain = Brain(epsilon=0.0, action_space=env.number_of_actions())

    model.setup(brain)

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_reward = 0
    # Episodes accumulated in the window; used as the averaging denominator
    # because a window does not always hold exactly print_episode episodes.
    window_episodes = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 100

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                MAP_PATH = "./Maps/Grid{}/map{}.txt".format(
                    GRID_SIZE, np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.quick_reset()
            done = False
            cumulative_reward = 0

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                action = brain.choose_action(state, sess, model)

                # Update environment by performing action
                state, reward, done, info = env.step(action)
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

            # Episode finished: fold its statistics into the window.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_reward += cumulative_reward
            window_episodes += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tavg_reward {0:.3f}".format(
                          avg_reward / window_episodes),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="\n")

                avg_time = 0
                avg_score = 0
                avg_reward = 0
                window_episodes = 0
def run_MetaNetwork():
    """Run the trained meta network together with its three dojo networks.

    At every step the meta network picks a dojo index (0: diamond,
    1: zombie, 2: explore) and the matching dojo network picks the
    environment action.  All networks are loaded as non-trainable and both
    the meta choice and the dojo choice use an epsilon of 0.0.
    """
    print("\n ---- Running the Meta Network ----- \n")

    MODEL_NAME = "meta15_input6_4_unfrozen"
    DIAMOND_MODEL_NAME = "diamond15_input6_best_unfrozen4_300k"
    ZOMBIE_MODEL_NAME = "zombie15_input6_best_unfrozen4_300k"
    EXPLORE_MODEL_NAME = "explore15_input6_best_unfrozen4_300k"

    MODEL_PATH_SAVE = ("./Models/Tensorflow/Meta/" + MODEL_NAME + "/"
                       + MODEL_NAME + ".ckpt")
    DOJO_DIR = "./Models/Tensorflow/Best_Dojos/"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None

    RANDOMIZE_MAPS = True

    RENDER_TO_SCREEN = True

    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        local_size=LOCAL_GRID_SIZE,
        rate=80,
        max_time=100,
        food_count=10,
        obstacle_count=0,
        lava_count=0,
        zombie_count=2,
        history=40,
        action_space=5,
        map_path=MAP_PATH,
    )

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME,
                        path="./Models/Tensorflow/Best_Meta/", load=True,
                        trainable=False)

    diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME,
                          path=DOJO_DIR, load=True, trainable=False)

    zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME,
                         path=DOJO_DIR, load=True, trainable=False)

    explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME,
                          path=DOJO_DIR, load=True, trainable=False)

    brain = Brain(epsilon=0.0, action_space=3)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    explore_net.setup(brain)

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_reward = 0
    # Episodes accumulated in the window; used as the averaging denominator
    # because a window does not always hold exactly print_episode episodes.
    window_episodes = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 100

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False
            cumulative_reward = 0

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                dojo = brain.choose_action(state, sess, model)

                # Every dojo receives the full, unmodified state.
                if dojo == 0:
                    dojo_net = diamond_net
                elif dojo == 1:
                    dojo_net = zombie_net
                elif dojo == 2:
                    dojo_net = explore_net
                else:
                    # Fail loudly instead of replaying a stale action.
                    raise ValueError(
                        "Unexpected dojo index: {}".format(dojo))

                action = brain.choose_dojo(state, sess, dojo_net,
                                           env.number_of_actions(), 0.0)

                # Update environment by performing action
                state, reward, done, info = env.step(action)
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

            # Episode finished: fold its statistics into the window.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_reward += cumulative_reward
            window_episodes += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tavg_reward {0:.3f}".format(
                          avg_reward / window_episodes),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_reward = 0
                window_episodes = 0
MAP_NUMBER = np.random.randint(5) # MAP_PATH = "./Maps/Grid{}/impossible_map{}.txt".format(GRID_SIZE, MAP_NUMBER) MAP_PATH = "./Maps/Grid{}/impossible_map_empty{}.txt".format(GRID_SIZE, np.random.randint(5)) >>>>>>> bb65e67a21ae21b44fe40601099140a06be937b5 env.set_map(MAP_PATH) state, info = env.reset() # state, info = env.quick_reset() done = False # brain.linear_epsilon_decay(total_episodes, episode, start=1.0, end=0.05, percentage=0.5) # brain.linear_alpha_decay(total_episodes, episode) if RENDER_TO_SCREEN: env.render() while not done: action = brain.choose_action(state, sess, model) # print(action) # Update environment by performing action new_state, reward, done, info = env.step(action) # print(new_state[3], reward) brain.store_transition(state, action, reward, done, new_state) e, Q_vector = brain.train(model, sess) state = new_state
def train_MetaNetwork():
    """Train the meta network that selects which dojo network acts.

    The three dojo networks (diamond, zombie, explore) are loaded as
    non-trainable.  At every step the meta network's Q-values are read, a
    dojo is chosen, the dojo picks the environment action, and the meta
    network is optimised towards the one-step target for the chosen dojo.
    Progress, TensorBoard summaries and checkpoints are written every
    ``print_episode`` episodes; a histogram of dojo choices is plotted at
    the end.
    """
    print("\n ---- Training the Meta Network ----- \n")

    MODEL_NAME = "meta_grid16_zero_2"
    MODEL_NAME_save = "meta_grid16_zero_2"

    DIAMOND_MODEL_NAME = "diamond_grid16_4"
    ZOMBIE_MODEL_NAME = "zombie_grid16_2"
    EXPLORE_MODEL_NAME = "explore_grid16_2"

    FOLDER = "Impossible"
    DOJO_FOLDER = "Impossible"

    MODEL_DIR = "./Models/Tensorflow/" + FOLDER + "/"
    DOJO_DIR = "./Models/Tensorflow/" + DOJO_FOLDER + "/"
    MODEL_PATH_SAVE = MODEL_DIR + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + ""

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 16
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None

    RANDOMIZE_MAPS = True

    RENDER_TO_SCREEN = False

    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        local_size=LOCAL_GRID_SIZE,
        rate=80,
        max_time=120,
        food_count=0,
        obstacle_count=0,
        lava_count=0,
        zombie_count=0,
        history=100,
        action_space=5,
        map_path=MAP_PATH,
    )

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME,
                        path=MODEL_DIR, load=False, trainable=True)

    diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME,
                          path=DOJO_DIR, load=True, trainable=False)

    zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME,
                         path=DOJO_DIR, load=True, trainable=False)

    explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME,
                          path=DOJO_DIR, load=True, trainable=False)

    brain = Brain(epsilon=0.05, action_space=3)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    explore_net.setup(brain)

    # Placeholders for the scalar statistics logged to TensorBoard.
    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    # Episodes accumulated in the window; used as the averaging denominator
    # because a window does not always hold exactly print_episode episodes.
    window_episodes = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Histogram of the dojos chosen by the meta network
    histogram = Histogram(3, 10, total_episodes)

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid{}/impossible_map{}.txt".format(
                    GRID_SIZE, np.random.randint(5))
                env.set_map(MAP_PATH)

            state, info = env.quick_reset()
            done = False
            cumulative_reward = 0

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # Retrieve the Q values from the NN in vector form
                Dojo_vector = sess.run(model.q_values,
                                       feed_dict={model.input: state})

                dojo = brain.choose_action(state, sess, model)

                histogram.check_section(episode)
                histogram.add(dojo)

                # Every dojo receives the full, unmodified state.
                if dojo == 0:
                    dojo_net = diamond_net
                elif dojo == 1:
                    dojo_net = zombie_net
                elif dojo == 2:
                    dojo_net = explore_net
                else:
                    # Fail loudly instead of replaying a stale action.
                    raise ValueError(
                        "Unexpected dojo index: {}".format(dojo))

                action = brain.choose_dojo(state, sess, dojo_net,
                                           env.number_of_actions(), 0.05)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                brain.store_transition_dojo(state, action, reward, done,
                                            new_state, dojo)

                if done:
                    # Terminal step: the target is the reward alone.
                    Dojo_vector[:, dojo] = reward
                else:
                    # Gathering the now current state's action-value vector
                    y_prime = sess.run(model.q_values,
                                       feed_dict={model.input: new_state})

                    # Equation for training
                    maxq = sess.run(model.y_prime_max,
                                    feed_dict={model.actions: y_prime})

                    # RL Equation
                    Dojo_vector[:, dojo] = reward + (brain.GAMMA * maxq)

                _, e = sess.run([model.optimizer, model.error],
                                feed_dict={model.input: state,
                                           model.actions: Dojo_vector})

                state = new_state
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

            # Episode finished: fold its statistics into the window.  Only
            # the error of the episode's last training step is accumulated.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_error += e
            avg_reward += cumulative_reward
            window_episodes += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tErr {0:.3f}".format(avg_error / window_episodes),
                      "\tavg_reward {0:.3f}".format(
                          avg_reward / window_episodes),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                # Checkpoint the meta network
                model.save(sess, name=MODEL_NAME_save)

                s = sess.run(merged_summary,
                             feed_dict={model.input: state,
                                        model.actions: Dojo_vector,
                                        score: avg_score / window_episodes,
                                        avg_t: avg_time / window_episodes,
                                        epsilon: brain.EPSILON,
                                        avg_r: avg_reward / window_episodes})
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0
                window_episodes = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)

        writer.close()

        histogram.plot()
def train():
    """Train a single dojo ``Network`` on the diamond-collection task.

    After every environment step the transition is stored and
    ``brain.train`` is called.  Every ``print_episode`` episodes the running
    averages are printed, the model is saved and a TensorBoard summary is
    written to ``LOGDIR``.
    """
    MODEL_NAME = "diamond9_input5"
    MODEL_NAME_save = "diamond9_input5"

    FOLDER = "Best_Dojos9"

    MODEL_PATH_SAVE = ("./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME_save
                       + "/" + MODEL_NAME_save + ".ckpt")

    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + "_2"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 9
    RANDOMIZE_MAPS = False

    # Set to e.g. "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, 0) to use a map.
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False

    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        local_size=LOCAL_GRID_SIZE,
        rate=80,
        max_time=50,
        food_count=10,
        obstacle_count=0,
        lava_count=0,
        zombie_count=0,
        history=0,
        action_space=5,
        map_path=MAP_PATH,
    )

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=False,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

    # Placeholders for the scalar statistics logged to TensorBoard.
    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    # Episodes accumulated in the window; used as the averaging denominator
    # because a window does not always hold exactly print_episode episodes.
    window_episodes = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 10000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # Cap this process at a fraction of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # NOTE(review): the path is hard-coded to Grid10 while
                # GRID_SIZE is 8 here - confirm before enabling
                # RANDOMIZE_MAPS.
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False
            cumulative_reward = 0

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                action = brain.choose_action(state, sess, model)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                brain.store_transition(state, action, reward, done, new_state)

                e, Q_vector = brain.train(model, sess)

                state = new_state
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

            # Episode finished: fold its statistics into the window.  Only
            # the error of the episode's last training step is accumulated.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_error += e
            avg_reward += cumulative_reward
            window_episodes += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tErr {0:.3f}".format(avg_error / window_episodes),
                      "\tavg_reward {0:.3f}".format(
                          avg_reward / window_episodes),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                # Checkpoint the model
                model.save(sess, name=MODEL_NAME_save)

                s = sess.run(merged_summary,
                             feed_dict={model.input: state,
                                        model.actions: Q_vector,
                                        score: avg_score / window_episodes,
                                        avg_t: avg_time / window_episodes,
                                        epsilon: brain.EPSILON,
                                        avg_r: avg_reward / window_episodes})
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0
                window_episodes = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)

        writer.close()
def train():
    """Train a dojo ``Network`` on randomly selected 10x10 map files.

    Epsilon is decayed linearly per episode via
    ``brain.linear_epsilon_decay``.  After every environment step the
    transition is stored and ``brain.train`` is called.  Every
    ``print_episode`` episodes the running averages are printed and both the
    model and a TensorFlow checkpoint are saved.
    """
    MODEL_NAME = "diamond_local15_maps"

    MODEL_PATH_SAVE = ("./Models/Tensorflow/Maps/" + MODEL_NAME + "/"
                       + MODEL_NAME + ".ckpt")

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    RANDOMIZE_MAPS = True

    # Set to e.g. "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, 0) to use a map.
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = True

    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        local_size=LOCAL_GRID_SIZE,
        rate=80,
        max_time=50,
        food_count=3,
        obstacle_count=1,
        lava_count=1,
        zombie_count=0,
        action_space=5,
        map_path=MAP_PATH,
    )

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=False,
                    path="./Models/Tensorflow/Maps/")

    brain = Brain(epsilon=0.05, action_space=env.number_of_actions())

    model.setup(brain)

    # The summary ops are still built although the TensorBoard writer is
    # currently disabled, so logging can be switched back on easily.
    tf.summary.scalar('error', tf.squeeze(model.error))

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_error = 0
    # Episodes accumulated in the window; used as the averaging denominator
    # because a window does not always hold exactly print_episode episodes.
    window_episodes = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Cap this process at a fraction of the GPU memory.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            # Only initialise when nothing was restored; running the
            # initialiser after a restore would overwrite the loaded weights.
            sess.run(init)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(
                    np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes, episode, start=0.5,
                                       end=0.05, percentage=0.6)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                action = brain.choose_action(state, sess, model)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                brain.store_transition(state, action, reward, done, new_state)

                e = brain.train(model, sess)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

            # Episode finished: fold its statistics into the window.  Only
            # the error of the episode's last training step is accumulated.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_error += e
            window_episodes += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tErr {0:.3f}".format(avg_error / window_episodes),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                window_episodes = 0

                # Checkpoint the model and the TensorFlow session
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
def train_MetaNetwork():
    """Train a two-way meta network over the diamond and zombie dojos.

    At every step the meta network's Q-values are read, a dojo is chosen
    (0: diamond, 1: zombie), the chosen dojo network selects the environment
    action from a layer-stripped copy of the state, and the meta network is
    optimised towards the one-step target for the chosen dojo.  Every
    ``print_episode`` episodes the averages are printed, the model and a
    checkpoint are saved, and a TensorBoard summary is written.
    """
    print("\n ---- Training the Meta Network ----- \n")

    MODEL_NAME = "meta_network_local15"
    DIAMOND_MODEL_NAME = "diamond_dojo_local15"
    ZOMBIE_MODEL_NAME = "zombie_dojo_local15"

    MODEL_PATH_SAVE = ("./Models/Tensorflow/" + MODEL_NAME + "/"
                       + MODEL_NAME + ".ckpt")

    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None

    RENDER_TO_SCREEN = False

    env = Environment(
        wrap=False,
        grid_size=GRID_SIZE,
        local_size=LOCAL_GRID_SIZE,
        rate=80,
        max_time=200,
        food_count=3,
        obstacle_count=0,
        lava_count=0,
        zombie_count=1,
        action_space=5,
        map_path=MAP_PATH,
    )

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME,
                        load=True)

    diamond_net = Network(local_size=LOCAL_GRID_SIZE, name=DIAMOND_MODEL_NAME,
                          load=True, trainable=False)

    zombie_net = Network(local_size=LOCAL_GRID_SIZE, name=ZOMBIE_MODEL_NAME,
                         load=True, trainable=False)

    brain = Brain(epsilon=0.01, action_space=2)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    # Running sums over the current reporting window.
    avg_time = 0
    avg_score = 0
    avg_error = 0
    # Episodes accumulated in the window; used as the averaging denominator
    # because a window does not always hold exactly print_episode episodes.
    window_episodes = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()

        print("")

        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes, episode, start=0.3,
                                       end=0.02, percentage=0.5)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # Retrieve the Q values from the NN in vector form
                Dojo_vector = sess.run(model.q_values,
                                       feed_dict={model.input: state})

                dojo = brain.choose_action(state, sess, model)

                # The dojo gets a layer-stripped COPY of the state.  The
                # full state is kept for the replay memory and for the meta
                # network update, because the Q-values above were computed
                # from the full state.
                if dojo == 0:
                    dojo_state = np.delete(state, 2, 0)  # Take out the zombie layer
                    dojo_state = np.delete(dojo_state, 5, 0)  # Take out the history layer
                    dojo_net = diamond_net
                elif dojo == 1:
                    dojo_state = np.delete(state, 1, 0)  # Take out the diamond layer
                    dojo_state = np.delete(dojo_state, 5, 0)  # Take out the history layer
                    dojo_net = zombie_net
                else:
                    # Only two dojos exist here (action_space=2); there is
                    # no explore network in this function.
                    raise ValueError(
                        "Unexpected dojo index: {}".format(dojo))

                action = brain.choose_dojo(dojo_state, sess, dojo_net,
                                           env.number_of_actions(), 0.01)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)

                brain.store_transition(state, dojo, reward, done, new_state)

                # Standard training with learning after every step
                if done:
                    # Terminal step: the target is the reward alone.
                    Dojo_vector[:, dojo] = reward
                else:
                    # Gathering the now current state's action-value vector
                    y_prime = sess.run(model.q_values,
                                       feed_dict={model.input: new_state})

                    # Equation for training
                    maxq = sess.run(model.y_prime_max,
                                    feed_dict={model.actions: y_prime})

                    # RL Equation
                    Dojo_vector[:, dojo] = reward + (brain.GAMMA * maxq)

                _, e = sess.run([model.optimizer, model.error],
                                feed_dict={model.input: state,
                                           model.actions: Dojo_vector})

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

            # Episode finished: fold its statistics into the window.  Only
            # the error of the episode's last training step is accumulated.
            avg_time += info["time"]
            avg_score += info["score"]
            avg_error += e
            window_episodes += 1

            if (episode % print_episode == 0 and episode != 0) or \
                    (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tErr {0:.3f}".format(avg_error / window_episodes),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                window_episodes = 0

                # Checkpoint the model and the TensorFlow session
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={model.input: state,
                                        model.actions: Dojo_vector})
                writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)

        writer.close()
def runDeepModel():
    """Play episodes with the deep Q-network and print running averages.

    The network is built with ``createDeepModel(x, load_variables=True)`` and
    is only evaluated here: no loss or optimizer op is created or run. Each
    step feeds ``env.local_state_vector_3D()`` to the network and takes the
    arg-max action, except with probability ``epsilon`` where
    ``env.sample_action()`` is used instead.

    Relies on module-level names defined elsewhere in this file: the
    placeholders ``x`` and ``y``, ``createDeepModel``, ``Environment``,
    ``WRAP``, ``GRID_SIZE``, ``FOOD_COUNT``, ``OBSTACLE_COUNT`` and
    ``MODEL_PATH_LOAD``.

    Returns:
        None. Results are printed to stdout.
    """
    print("\n ---- Running the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = True

    # True - restore variables from MODEL_PATH_LOAD after initialisation;
    # False - keep whatever values the initializer produces
    USE_SAVED_MODEL_FILE = False

    # NOTE(review): the original comment said this network needs a grid_size
    # of 10 - confirm that GRID_SIZE matches the network's input shape.
    env = Environment(wrap=WRAP,
                      grid_size=GRID_SIZE,
                      rate=80,
                      max_time=200,
                      food_count=FOOD_COUNT,
                      obstacle_count=OBSTACLE_COUNT,
                      zombie_count=0,
                      action_space=5,
                      map_path=None)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Probability to choose a sampled action instead of the best action
    epsilon = 0.001

    # Create NN model (only the Q-value output is needed for evaluation)
    with tf.name_scope('Model'):
        Q_values, _, _ = createDeepModel(x, load_variables=True)

    # Index of the max value in the action-value tensor fed through `y`
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    # Number of episodes accumulated since the last report, so the printed
    # averages are divided by the real window size.
    window_episodes = 0

    print_episode = 1
    total_episodes = 10

    saver = tf.train.Saver()
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        # Initialise first and restore second: running the initializer after
        # saver.restore() would overwrite the restored values.
        sess.run(init)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        for episode in range(total_episodes):
            env.reset()
            done = False

            while not done:
                if RENDER_TO_SCREEN:
                    env.render()

                # Current state as produced by the environment
                state_vector = env.local_state_vector_3D()

                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state_vector})

                # Deciding on which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # Index of the largest Q value (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update environment by performing the action
                _, _, done, info = env.step(action)

            # `info` is the value returned by the terminal step
            avg_time += info["time"]
            avg_score += info["score"]
            window_episodes += 1

            # Episode 0 is folded into the first report; the last episode
            # always reports so trailing episodes are never dropped.
            is_report_episode = episode % print_episode == 0 and episode != 0
            if is_report_episode or episode == total_episodes - 1:
                print("Ep:", episode,
                      "\tavg t:", avg_time / window_episodes,
                      "\tavg score:", avg_score / window_episodes)
                avg_time = 0
                avg_score = 0
                window_episodes = 0
def _linear_decay(start, end, fraction, total_episodes, episode):
    """Return a value moving linearly from start to end, then held at end.

    The line reaches ``end`` after ``fraction * total_episodes`` episodes and
    the result is never allowed to drop below ``end``.
    """
    horizon = fraction * total_episodes
    if horizon <= 0:
        # Nothing to decay over; avoid dividing by zero.
        return end
    value = (-(start - end) / horizon) * episode + start
    return max(value, end)


def trainDeepModel(load=False):
    """Train the deep Q-network online, one gradient step per environment step.

    For every step the Q-vector of the current state is fetched, the entry of
    the taken action is replaced by ``reward`` (terminal step) or
    ``reward + gamma * max(Q(new_state))``, and one optimizer step is run on
    that target. Every ``print_episode`` episodes (and on the last episode)
    the running averages are printed, the weights and biases are written to
    ``.npy`` files and a summary is added for TensorBoard. A TensorFlow
    checkpoint is written once training has finished.

    Relies on module-level names defined elsewhere in this file: the
    placeholders ``x`` and ``y``, ``createDeepModel``, ``Environment``,
    ``WRAP``, ``LOGDIR``, ``MODEL_PATH_LOAD``, ``MODEL_PATH_SAVE`` and the
    ``*_textfile_path_save`` paths.

    Args:
        load: Forwarded to ``createDeepModel`` as ``load_variables``.

    Returns:
        None.
    """
    # Used to see how long the model takes to train
    start_time = time.time()

    print("\n ---- Training the Deep Neural Network ----- \n")

    # Decide whether or not to render to the screen
    RENDER_TO_SCREEN = False

    # True - restore variables from MODEL_PATH_LOAD after initialisation;
    # False - keep whatever values the initializer produces
    USE_SAVED_MODEL_FILE = False

    env = Environment(wrap=WRAP,
                      grid_size=5,
                      rate=80,
                      max_time=30,
                      food_count=3,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      action_space=5,
                      map_path=None)

    if RENDER_TO_SCREEN:
        env.prerender()

    # Hyper-parameters
    alpha = 0.001  # Learning rate
    gamma = 0.99  # Discount factor applied to the best next-state Q value
    epsilon = 0.01  # Probability to choose a sampled action

    # Linear epsilon schedule (overrides `epsilon` when enabled)
    epsilon_function = True
    epsilon_start = 0.5
    epsilon_end = 0.05
    epsilon_percentage = 0.5  # in decimal

    # Linear alpha schedule (overrides `alpha` when enabled)
    alpha_function = False
    alpha_start = 0.01
    alpha_end = 0.003
    alpha_percentage = 0.9  # in decimal

    # Create NN model
    with tf.name_scope('Model'):
        Q_values, weights, biases = createDeepModel(x, load_variables=load)

    # Error / Loss function
    with tf.name_scope('Error'):
        error = tf.losses.mean_squared_error(labels=Q_values, predictions=y)
        tf.summary.scalar('error', tf.squeeze(error))

    # Gradient descent optimizer - minimizes error/loss function.
    # The learning rate is a placeholder so a per-episode alpha schedule
    # actually reaches the optimizer; a Python float would be frozen into the
    # graph at construction time.
    with tf.name_scope('Optimizer'):
        learning_rate = tf.placeholder(tf.float32, shape=[],
                                       name='learning_rate')
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate).minimize(error)

    # Max over the action axis of the action-value tensor fed through `y`
    with tf.name_scope('Max_y_prime'):
        y_prime_max = tf.reduce_max(y, axis=1)

    # Index of the max value in the action-value tensor fed through `y`
    with tf.name_scope('Max_action'):
        action_t = tf.argmax(y, axis=1)

    avg_time = 0
    avg_score = 0
    avg_error = 0
    # Number of episodes accumulated since the last report, so the printed
    # averages are divided by the real window size.
    window_episodes = 0

    print_episode = 1000
    total_episodes = 100000

    # (destination path, tensor) pairs written to .npy on every report
    saved_tensors = (
        (W_conv1_textfile_path_save, weights['W_conv1']),
        (W_conv2_textfile_path_save, weights['W_conv2']),
        (W_fc_textfile_path_save, weights['W_fc']),
        (W_out_textfile_path_save, weights['W_out']),
        (b_conv1_textfile_path_save, biases['b_conv1']),
        (b_conv2_textfile_path_save, biases['b_conv2']),
        (b_fc_textfile_path_save, biases['b_fc']),
        (b_out_textfile_path_save, biases['b_out']),
    )

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilities
    writer = tf.summary.FileWriter(LOGDIR)

    with tf.Session() as sess:
        # Initialise first and restore second: running the initializer after
        # saver.restore() would overwrite the restored values.
        sess.run(init)
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_LOAD)
            print("Model restored.")

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(
            time.time() - start_time))
        start_time = time.time()

        # NOTE: a disabled experience-replay variant (train on one randomly
        # sampled stored transition per step) used to live here as a string
        # literal; only the online update below was ever executed.
        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            if alpha_function:
                alpha = _linear_decay(alpha_start, alpha_end,
                                      alpha_percentage, total_episodes,
                                      episode)

            if epsilon_function:
                epsilon = _linear_decay(epsilon_start, epsilon_end,
                                        epsilon_percentage, total_episodes,
                                        episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # Retrieve the Q values from the NN in vector form
                Q_vector = sess.run(Q_values, feed_dict={x: state})

                # Deciding on which action to take
                if np.random.rand() <= epsilon:
                    action = env.sample_action()
                else:
                    # Index of the largest Q value (output vector of NN)
                    action = sess.run(action_t, feed_dict={y: Q_vector})

                # Update environment by performing the action
                new_state, reward, done, info = env.step(action)

                # Build the training target in place: only the taken action's
                # entry changes, so the loss is driven by that entry alone.
                if done:
                    Q_vector[:, action] = reward
                else:
                    # Action-value vector of the state just entered
                    y_prime = sess.run(Q_values, feed_dict={x: new_state})
                    maxq = sess.run(y_prime_max, feed_dict={y: y_prime})
                    # RL equation
                    Q_vector[:, action] = reward + (gamma * maxq)

                _, e = sess.run([optimizer, error],
                                feed_dict={
                                    x: state,
                                    y: Q_vector,
                                    learning_rate: alpha
                                })

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

            # `info` and `e` come from the terminal step of the episode
            avg_time += info["time"]
            avg_score += info["score"]
            avg_error += e
            window_episodes += 1

            is_report_episode = episode % print_episode == 0 and episode != 0
            if is_report_episode or episode == total_episodes - 1:
                current_time = int(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / window_episodes),
                      "\tavg score: {0:.3f}".format(
                          avg_score / window_episodes),
                      "\tErr {0:.3f}".format(avg_error / window_episodes),
                      "\tepsilon {0:.3f}".format(epsilon),
                      end="")

                # Elapsed time as H:MM:SS
                minutes, seconds = divmod(current_time, 60)
                hours, minutes = divmod(minutes, 60)
                print("\ttime {0}:{1:02d}:{2:02d}".format(
                    hours, minutes, seconds))

                avg_time = 0
                avg_score = 0
                avg_error = 0
                window_episodes = 0

                # Save the model's weights and biases to .npy files, fetched
                # in a single run so all arrays come from the same step.
                values = sess.run([tensor for _, tensor in saved_tensors])
                for (path, _), value in zip(saved_tensors, values):
                    np.save(path, np.array(value).astype(np.float32))

                s = sess.run(merged_summary,
                             feed_dict={x: state, y: Q_vector})
                writer.add_summary(s, episode)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)

    # Flush pending events and release the event file
    writer.close()