def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []

    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0
        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1
            action = model(
                torch.tensor(state, dtype=torch.float32,
                             device=device).unsqueeze(0)).argmax()
            state, reward, done = env.step(action)
            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break

    env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")
        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
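
# A minimal usage sketch (assumed, not part of the original source): calling
# eval() with explicit arguments instead of the module-level defaults. The
# checkpoint path below is purely illustrative.
if __name__ == "__main__":
    eval(model_type='dqn', model_path="weights/example_run/10000.tar")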
class DQNAgent:

    def __init__(
            self,
            env,
            memory_size,
            batch_size,
            target_update=100,
            gamma=0.99,
            # replay parameters
            alpha=0.2,
            beta=0.6,
            prior_eps=1e-6,
            # Categorical DQN parameters
            v_min=0,
            v_max=200,
            atom_size=51,
            # N-step Learning
            n_step=3,
            start_train=32,
            save_weights=True,
            log=True,
            lr=0.001,
            seed=0,
            episodes=200):
        self.env = env
        obs_dim = self.env.observation_dim
        action_dim = self.env.action_dim
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        self.lr = lr
        self.memory_size = memory_size
        self.seed = seed

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(obs_dim,
                                              memory_size,
                                              batch_size,
                                              alpha=alpha)

        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(obs_dim,
                                         memory_size,
                                         batch_size,
                                         n_step=n_step,
                                         gamma=gamma)

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.atom_size).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(obs_dim, action_dim, self.atom_size,
                           self.support).to(self.device)
        self.dqn_target = Network(obs_dim, action_dim, self.atom_size,
                                  self.support).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=self.lr)

        # transition to store in memory
        self.transition = list()

        self.fig, (self.ax1, self.ax2) = plt.subplots(2, figsize=(10, 10))
        self.start_train = start_train
        self.save_weights = save_weights
        self.time = datetime.datetime.now().timetuple()
        self.path = f"weights/{self.time[2]}-{self.time[1]}-{self.time[0]}_{self.time[3]}-{self.time[4]}"
        self.log = log
        self.episode_cnt = 0
        self.episodes = episodes
        if self.save_weights is True:
            self.create_save_directory()
        plt.ion()

    def create_save_directory(self):
        try:
            os.mkdir(self.path)
        except OSError:
            print("Creation of the directory %s failed" % self.path)
        else:
            print("Successfully created the directory %s " % self.path)

    def select_action(self, state):
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(torch.FloatTensor(state).to(
            self.device)).argmax()
        selected_action = selected_action.detach().cpu().numpy()
        self.transition = [state, selected_action]
        return selected_action

    def step(self, action):
        """Take an action and return the response of the env."""
        next_state, reward, done = self.env.step(action)
        self.transition += [reward, next_state, done]

        # N-step transition
        if self.use_n_step:
            one_step_transition = self.memory_n.store(*self.transition)
        # 1-step transition
        else:
            one_step_transition = self.transition

        # add a single step transition
        if one_step_transition:
            self.memory.store(*one_step_transition)

        return next_state, reward, done

    def update_model(self):
        """Update the model by gradient descent."""
        # PER needs beta to calculate weights
        samples = self.memory.sample_batch(self.beta)
        weights = torch.FloatTensor(samples["weights"].reshape(-1, 1)).to(
            self.device)
        indices = samples["indices"]

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)

        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)

        # N-step Learning loss
        # we are gonna combine 1-step loss and n-step loss so as to
        # prevent high-variance. The original rainbow employs n-step loss only.
        if self.use_n_step:
            gamma = self.gamma**self.n_step
            samples = self.memory_n.sample_batch_from_idxs(indices)
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        self.optimizer.zero_grad()
        loss.backward()
        # print(loss)
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()

        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.item()

    def train(self, num_frames, plotting_interval=100):
        """Train the agent."""
        if self.log:
            pass
            # config = {'gamma': self.gamma, 'log_interval': plotting_interval, 'learning_rate': self.lr,
            #           'directory': self.path, 'type': 'dqn', 'replay_memory': self.memory_size,
            #           'environment': 'normal', 'seed': self.seed}
            # wandb.init(project='is_os', entity='pydqn', config=config, notes=self.env.reward_function, reinit=True, tags=['report'])
            # wandb.watch(self.dqn)

        self.env.reset()
        state = self.env.get_state()
        won = False
        update_cnt = 0
        losses = []
        scores = []
        score = 0
        frame_cnt = 0
        self.episode_cnt = 0

        for frame_idx in range(1, num_frames + 1):
            frame_cnt += 1
            action = self.select_action(state)
            next_state, reward, done = self.step(action)
            state = next_state
            score += reward

            fraction = min(frame_cnt / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if agent has trained 500 frames, terminate
            if frame_cnt == 500:
                done = True

            # if episode ends
            if done:
                if reward > 0:
                    won = True
                self.env.reset()
                state = self.env.get_state()
                self.episode_cnt += 1
                scores.append(score)
                score = 0
                frame_cnt = 0

            # if training is ready
            if len(self.memory) >= self.batch_size:
                loss = self.update_model()
                losses.append(loss)
                update_cnt += 1

                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()

            # plotting
            if frame_idx % plotting_interval == 0:
                self._plot(frame_idx, scores, losses)

            if frame_idx % 1000 == 0:
                torch.save(self.dqn.state_dict(),
                           f'{self.path}/{frame_idx}.tar')
                print(f"model saved at:\n {self.path}/{frame_idx}.tar")

        # wandb.run.summary['won'] = won
        self.env.close()

    def _compute_dqn_loss(self, samples, gamma):
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["acts"]).to(device)
        reward = torch.FloatTensor(samples["rews"].reshape(-1, 1)).to(device)
        done = torch.FloatTensor(samples["done"].reshape(-1, 1)).to(device)

        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (torch.linspace(
                0, (self.batch_size - 1) * self.atom_size,
                self.batch_size).long().unsqueeze(1).expand(
                    self.batch_size, self.atom_size).to(self.device))

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(
                0, (l + offset).view(-1),
                (next_dist * (u.float() - b)).view(-1))
            proj_dist.view(-1).index_add_(
                0, (u + offset).view(-1),
                (next_dist * (b - l.float())).view(-1))

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)

        return elementwise_loss

    def _target_hard_update(self):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())

    def _plot(self, frame_cnt, scores, losses):
        self.ax1.cla()
        self.ax1.set_title(
            f'frames: {frame_cnt} score: {np.mean(scores[-10:])}')
        self.ax1.plot(scores[-999:], color='red')
        self.ax2.cla()
        self.ax2.set_title(f'loss: {np.mean(losses[-10:])}')
        self.ax2.plot(losses[-999:], color='blue')
        plt.show()
        plt.pause(0.1)

        # needed for wandb to not log nans
        # if frame_cnt < self.start_train + 11:
        #     loss = 0
        # else:
        #     loss = np.mean(losses[-10:])
        if self.log:
            pass
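
# A minimal usage sketch (assumed, not part of the original source): DQNAgent
# expects an environment exposing observation_dim, action_dim, reset(),
# get_state(), step() and close(), such as the custom LunarLander used in the
# evaluation script above. The hyperparameter values are illustrative only.
if __name__ == "__main__":
    env = LunarLander()
    agent = DQNAgent(env,
                     memory_size=100_000,
                     batch_size=128,
                     target_update=100,
                     n_step=3,
                     lr=0.001)
    agent.train(num_frames=50_000, plotting_interval=100)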
def runMission(train=False, load_model=False):
    '''
    Run or train Deep Model
    Training method still needs to be added
    '''

    # Global timer - multi purpose
    start_time = time.time()

    print("\n ---- Running the Deep Q Network ----- \n")

    USE_SAVED_MODEL_FILE = False

    # agent_host, my_mission, my_mission_record, action_space = setupMinecraft()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    path="./Models/Tensorflow/" + FOLDER + "/",
                    load=load_model,
                    trainable=train)

    brain = Brain(epsilon=0.1, action_space=5)

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    print_episode = 1
    total_episodes = 10

    # Saving model capabilities
    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # Assume that you have 12GB of GPU memory and want to allocate ~4GB:
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin Session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # Restore the model, to keep training
        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_CHECKPOINT)
            print("Model restored.")
        else:
            # Initialize global variables
            sess.run(init)

        # Tensorboard graph
        writer.add_graph(sess.graph)

        print("\nProgram took {0:.4f} seconds to initialise\n".format(
            time.time() - start_time))
        start_time = time.time()

        # Running mission
        for episode in range(total_episodes):

            agent_host, my_mission, my_mission_record, action_space = setupMinecraft()
            world_state = reset(agent_host, my_mission, my_mission_record)

            score = 0
            done = False
            craft_sword = False

            # Getting first observation
            while True:
                world_state = agent_host.getWorldState()
                if world_state.number_of_observations_since_last_state > 0:
                    break

            msg = world_state.observations[-1].text
            observations = json.loads(msg)
            # grid = observations.get(u'floor9x9', 0)
            grid = observations.get(u'floor15x15', 0)
            score = observations.get(u'Hotbar_8_size', 0)
            nearby_entites = observations.get(u'nearby_entites', 0)

            diamonds = []
            zombies = []
            steve_pos = (0, 0)
            steve_life = 20

            for entity in nearby_entites:
                if entity["name"] == "diamond":
                    diamonds.append((entity["x"], entity["z"]))
                if entity["name"] == "steve":
                    steve_pos = ((entity["x"], entity["z"]))
                    steve_life = entity["life"]
                if entity["name"] == "Zombie":
                    zombies.append((entity["x"], entity["z"]))

            state = get_state(steve_pos, diamonds, zombies, grid)

            # brain.linear_epsilon_decay(total_episodes, episode, start=0.3, end=0.05, percentage=0.5)

            world_state = agent_host.getWorldState()

            while world_state.is_mission_running and not done:
                print("-", end="")
                time.sleep(0.01)

                action = brain.choose_action(state, sess, model)
                # print("action:", action_space[action])

                if craft_sword:
                    agent_host.sendCommand("craft diamond_sword")
                    done = True
                else:
                    agent_host.sendCommand(action_space[action])

                time.sleep(0.2)

                world_state = agent_host.getWorldState()

                for error in world_state.errors:
                    print("Error:", error.text)

                # Have we received any observations?
                if world_state.number_of_observations_since_last_state > 0:
                    # if world_state.number_of_observations_since_last_state > 0 and world_state.number_of_rewards_since_last_state > 0:
                    msg = world_state.observations[-1].text
                    observations = json.loads(msg)
                    # print("\n\n", observations, "\n\n")
                    grid = observations.get(u'floor15x15', 0)
                    score = observations.get(u'Hotbar_8_size', 0)
                    nearby_entites = observations.get(u'nearby_entites', 0)

                    diamonds = []
                    zombies = []

                    for entity in nearby_entites:
                        if entity["name"] == "diamond":
                            diamonds.append((entity["x"], entity["z"]))
                        if entity["name"] == "Steve":
                            steve_pos = ((entity["x"], entity["z"]))
                            steve_life = entity["life"]
                        if entity["name"] == "Zombie":
                            zombies.append((entity["x"], entity["z"]))

                    # Debugging - print the state
                    for i in range(6):
                        print(state[i])
                    print()

                    new_state = get_state(steve_pos, diamonds, zombies, grid)

                    # reward = world_state.rewards[-1].getValue()
                    # score += reward

                    # brain.store_transition(state, action, reward, done, new_state)
                    # e, Q_vector = brain.train(model, sess)

                    state = new_state

                    # cumulative_reward += reward

                    # print(score)

                    if score >= 6:
                        craft_sword = True

                    if steve_life != 20:
                        done = True

            # if done:
            #     avg_time += info["time"]
            #     avg_score += info["score"]
            #     avg_error += e
            #     avg_reward += cumulative_reward
            #     cumulative_reward = 0

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print(
                    "Ep:", episode,
                    "\tavg t: {0:.3f}".format(avg_time / print_episode),
                    "\tavg score: {0:.3f}".format(avg_score / print_episode),
                    "\terr {0:.3f}".format(avg_error / print_episode),
                    "\tavg_reward {0:.3f}".format(avg_reward / print_episode),  # avg cumulative reward
                    "\tepsilon {0:.3f}".format(brain.EPSILON),
                    end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                # model.save(sess)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                # s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector,
                #                                         score: avg_score/print_episode, avg_t: avg_time/print_episode,
                #                                         epsilon: brain.EPSILON, avg_r: avg_reward/print_episode})
                # writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        # model.save(sess, verbose=True)

        # save_path = saver.save(sess, MODEL_CHECKPOINT)
        # print("Model saved in path: %s" % save_path)

    writer.close()
def run_MetaNetwork():
    print("\n ---- Running the Meta Network ----- \n")

    MODEL_NAME = "meta15_input6_4_unfrozen"
    DIAMOND_MODEL_NAME = "diamond15_input6_best_unfrozen4_300k"
    ZOMBIE_MODEL_NAME = "zombie15_input6_best_unfrozen4_300k"
    EXPLORE_MODEL_NAME = "explore15_input6_best_unfrozen4_300k"

    MODEL_PATH_SAVE = "./Models/Tensorflow/Meta/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"
    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None
    RANDOMIZE_MAPS = True

    RENDER_TO_SCREEN = False
    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=100,
                      food_count=10,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=2,
                      history=40,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE,
                        name=MODEL_NAME,
                        path="./Models/Tensorflow/Best_Meta/",
                        load=True,
                        trainable=False)
    diamond_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=DIAMOND_MODEL_NAME,
                          path="./Models/Tensorflow/Best_Dojos/",
                          load=True,
                          trainable=False)
    zombie_net = Network(local_size=LOCAL_GRID_SIZE,
                         name=ZOMBIE_MODEL_NAME,
                         path="./Models/Tensorflow/Best_Dojos/",
                         load=True,
                         trainable=False)
    explore_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=EXPLORE_MODEL_NAME,
                          path="./Models/Tensorflow/Best_Dojos/",
                          load=True,
                          trainable=False)

    brain = Brain(epsilon=0.0, action_space=3)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    explore_net.setup(brain)

    avg_time = 0
    avg_score = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 100

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        start_time = time.time()
        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                dojo = brain.choose_action(state, sess, model)
                # print(dojo)

                if dojo == 0:
                    dojo_state = state
                    # dojo_state = np.delete(dojo_state, 2, 0)  # Take out the zombie layer
                    # dojo_state = np.delete(dojo_state, 2, 0)  # Take out the history layer
                    action = brain.choose_dojo(dojo_state, sess, diamond_net,
                                               env.number_of_actions(), 0.0)
                elif dojo == 1:
                    dojo_state = state
                    # dojo_state = np.delete(dojo_state, 1, 0)  # Take out the diamond layer
                    # dojo_state = np.delete(dojo_state, 2, 0)  # Take out the history layer
                    action = brain.choose_dojo(dojo_state, sess, zombie_net,
                                               env.number_of_actions(), 0.0)
                elif dojo == 2:
                    dojo_state = state
                    # dojo_state = np.delete(dojo_state, 1, 0)  # Take out the diamond layer
                    # dojo_state = np.delete(dojo_state, 1, 0)  # Take out the zombie layer
                    action = brain.choose_dojo(dojo_state, sess, explore_net,
                                               env.number_of_actions(), 0.0)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                state = new_state
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tavg_reward {0:.3f}".format(avg_reward / print_episode),  # avg cumulative reward
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_reward = 0
def run():
    MODEL_NAME = "explore15_input6"
    FOLDER = "Best_Dojos"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 32
    LOCAL_GRID_SIZE = 15
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = False

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None
    MAP_PATH = "./Maps/Grid{}/impossible_map1.txt".format(GRID_SIZE, MAP_NUMBER)

    print("\n ---- Running the Deep Q Network ----- \n")

    RENDER_TO_SCREEN = False
    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=60,
                      food_count=0,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=40,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=True,
                    path="./Models/Tensorflow/" + FOLDER + "/",
                    trainable=False)

    brain = Brain(epsilon=0.0, action_space=env.number_of_actions())

    model.setup(brain)

    avg_time = 0
    avg_score = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 100

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, np.random.randint(10))
                env.set_map(MAP_PATH)

            # state, info = env.reset()
            state, info = env.quick_reset()
            done = False

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                action = brain.choose_action(state, sess, model)
                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                state = new_state
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tavg_reward {0:.3f}".format(avg_reward / print_episode),  # avg cumulative reward
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="\n")

                avg_time = 0
                avg_score = 0
                avg_reward = 0
def train_MetaNetwork():
    print("\n ---- Training the Meta Network ----- \n")

    MODEL_NAME = "meta_grid16_zero_2"
    MODEL_NAME_save = "meta_grid16_zero_2"

    DIAMOND_MODEL_NAME = "diamond_grid16_4"
    ZOMBIE_MODEL_NAME = "zombie_grid16_2"
    EXPLORE_MODEL_NAME = "explore_grid16_2"
    # EXTRA_MODEL_NAME = "extra15_input6_2"

    # MODEL_NAME = "meta15_input6_1M_unfrozen_dojos"
    # DIAMOND_MODEL_NAME = "diamond15_input4_best_unfrozen_at_1M"
    # ZOMBIE_MODEL_NAME = "zombie15_input4_best_unfrozen_at_1M"
    # EXPLORE_MODEL_NAME = "explore15_input4_best_unfrozen_at_1M"

    # MODEL_NAME = "meta15_input6_1M_random_unfrozen_cointoss"
    # DIAMOND_MODEL_NAME = "diamond15_input4_1M_random_unfrozen_cointoss"
    # ZOMBIE_MODEL_NAME = "zombie15_input4_1M_random_unfrozen_cointoss"
    # EXPLORE_MODEL_NAME = "explore15_input4_1M_random_unfrozen_cointoss"

    FOLDER = "Impossible"
    DOJO_FOLDER = "Impossible"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"
    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 16
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None
    RANDOMIZE_MAPS = True

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=120,
                      food_count=0,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=100,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE,
                        name=MODEL_NAME,
                        path="./Models/Tensorflow/" + FOLDER + "/",
                        load=False,
                        trainable=True)
    diamond_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=DIAMOND_MODEL_NAME,
                          path="./Models/Tensorflow/" + DOJO_FOLDER + "/",
                          load=True,
                          trainable=False)
    zombie_net = Network(local_size=LOCAL_GRID_SIZE,
                         name=ZOMBIE_MODEL_NAME,
                         path="./Models/Tensorflow/" + DOJO_FOLDER + "/",
                         load=True,
                         trainable=False)
    explore_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=EXPLORE_MODEL_NAME,
                          path="./Models/Tensorflow/" + DOJO_FOLDER + "/",
                          load=True,
                          trainable=False)
    # extra_net = Network(local_size=LOCAL_GRID_SIZE, name=EXTRA_MODEL_NAME,
    #                     path="./Models/Tensorflow/"+DOJO_FOLDER+"/", load=False, trainable=True)

    brain = Brain(epsilon=0.05, action_space=3)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    explore_net.setup(brain)
    # extra_net.setup(brain)

    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # Histogram
    histogram = Histogram(3, 10, total_episodes)

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()
        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid{}/impossible_map{}.txt".format(GRID_SIZE, np.random.randint(5))
                env.set_map(MAP_PATH)

            # state, info = env.reset()
            state, info = env.quick_reset()
            done = False

            # brain.linear_epsilon_decay(total_episodes, episode, start=1.0, end=0.05, percentage=0.5)
            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # Retrieve the Q values from the NN in vector form
                Dojo_vector = sess.run(model.q_values, feed_dict={model.input: state})

                dojo = brain.choose_action(state, sess, model)
                histogram.check_section(episode)
                histogram.add(dojo)

                # dojo = np.random.randint(3)
                # dojo = 0
                # print(dojo)

                if dojo == 0:
                    dojo_state = state
                    # dojo_state[2] = 0
                    # dojo_state[3] = 0
                    # dojo_state = np.delete(dojo_state, 2, 0)  # Take out the zombie layer
                    # dojo_state = np.delete(dojo_state, 2, 0)  # Take out the history layer
                    action = brain.choose_dojo(dojo_state, sess, diamond_net,
                                               env.number_of_actions(), 0.05)
                elif dojo == 1:
                    dojo_state = state
                    # dojo_state[1] = 0
                    # dojo_state[3] = 0
                    # dojo_state = np.delete(dojo_state, 1, 0)  # Take out the diamond layer
                    # dojo_state = np.delete(dojo_state, 2, 0)  # Take out the history layer
                    action = brain.choose_dojo(dojo_state, sess, zombie_net,
                                               env.number_of_actions(), 0.05)
                elif dojo == 2:
                    dojo_state = state
                    # dojo_state[1] = 0
                    # dojo_state[2] = 0
                    # dojo_state = np.delete(dojo_state, 1, 0)  # Take out the diamond layer
                    # dojo_state = np.delete(dojo_state, 1, 0)  # Take out the zombie layer
                    action = brain.choose_dojo(dojo_state, sess, explore_net,
                                               env.number_of_actions(), 0.05)
                # elif dojo == 3:
                #     dojo_state = state
                #     action = brain.choose_dojo(dojo_state, sess, extra_net, env.number_of_actions(), 0.05)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition_dojo(state, action, reward, done, new_state, dojo)

                # print(tf.trainable_variables(scope=None))

                # if dojo == 0:
                #     e, Q_vector = brain.train_3_dojos(diamond_net, sess, dojo)
                # elif dojo == 1:
                #     e, Q_vector = brain.train_3_dojos(zombie_net, sess, dojo)
                # elif dojo == 2:
                #     e, Q_vector = brain.train_3_dojos(explore_net, sess, dojo)

                # e, Q_vector = brain.train_3(sess, diamond_net, zombie_net, explore_net)
                # e, Q_vector = brain.train(extra_net, sess)

                if done:
                    Dojo_vector[:, dojo] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now current state's action-value vector
                    y_prime = sess.run(model.q_values, feed_dict={model.input: new_state})

                    # Equation for training
                    maxq = sess.run(model.y_prime_max, feed_dict={model.actions: y_prime})

                    # RL Equation
                    Dojo_vector[:, dojo] = reward + (brain.GAMMA * maxq)

                _, e = sess.run([model.optimizer, model.error],
                                feed_dict={model.input: state,
                                           model.actions: Dojo_vector})

                state = new_state
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tavg_reward {0:.3f}".format(avg_reward / print_episode),  # avg cumulative reward
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                model.save(sess, name=MODEL_NAME_save)
                # diamond_net.save(sess, name=DIAMOND_MODEL_NAME)
                # zombie_net.save(sess, name=ZOMBIE_MODEL_NAME)
                # explore_net.save(sess, name=EXPLORE_MODEL_NAME)
                # extra_net.save(sess, name=EXTRA_MODEL_NAME)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={model.input: state,
                                        model.actions: Dojo_vector,
                                        score: avg_score / print_episode,
                                        avg_t: avg_time / print_episode,
                                        epsilon: brain.EPSILON,
                                        avg_r: avg_reward / print_episode})
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)
        # diamond_net.save(sess, verbose=True, name=DIAMOND_MODEL_NAME)
        # zombie_net.save(sess, verbose=True, name=ZOMBIE_MODEL_NAME)
        # explore_net.save(sess, verbose=True, name=EXPLORE_MODEL_NAME)
        # extra_net.save(sess, verbose=True, name=EXTRA_MODEL_NAME)
        # save_path = saver.save(sess, MODEL_PATH_SAVE)
        # print("Model saved in path: %s" % save_path)

    writer.close()

    histogram.plot()
    # The opening of this Environment(...) call (e.g. the wrap and grid_size
    # arguments) is not included in this excerpt.
    env = Environment(local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=120,
                      food_count=20,
                      stick_count=0,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=True,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)
# model_type = 'dqn'
model_path = "policies/22-1-2021_13-44/policy0.tar"

env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path))
model.eval()

state = env.get_state()

while not exit_program:
    env.render()
    action = model(
        torch.tensor(state, dtype=torch.float32,
                     device=device).unsqueeze(0)).argmax()
    state, reward, done = env.step(action)

    # Process game events
    for event in pygame.event.get():
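        # The body of the event loop is not included in the excerpt above; a
        # minimal sketch (assumed, not the author's code): stop the render
        # loop on the window-close event or when Escape is pressed.
        if event.type == pygame.QUIT:
            exit_program = True
        elif event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE:
            exit_program = True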
def main(env, args):
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        model = load_vae()
        best_params = np.load('best_params.npy', allow_pickle=True)

        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # env.render()
                # TODO: Choose an action
                action = decide_action(model, state, best_params)
                state, reward, done, _ = env.step(action)

    elif args.DQN:
        network = Network(env, args)
        if os.path.exists('dqn.model'):
            network.model = tf.keras.models.load_model('dqn.model')
        vae = load_vae()

        replay_buffer = collections.deque(maxlen=100000)
        Transition = collections.namedtuple(
            "Transition", ["state", "action", "reward", "done", "next_state"])

        epsilon = 0.25
        gamma = 1

        for i in tqdm(range(10000)):
            state, done = env.reset(), False
            while not done:
                embedding = vae.get_latent_representation(np.array([state]))
                q_values = network.predict(embedding)[0]

                if np.random.uniform() >= epsilon:
                    action = np.argmax(q_values)
                else:
                    action = np.random.randint(0, env.action_space.n)

                next_state, reward, done, _ = env.step(action)
                replay_buffer.append(
                    Transition(
                        embedding, action, reward, done,
                        vae.get_latent_representation(np.array([next_state]))))

                if len(replay_buffer) > 32:
                    minibatch = random.sample(replay_buffer, 32)
                    states = np.array([t.state[0] for t in minibatch])
                    actions = np.array([t.action for t in minibatch])
                    rewards = np.array([t.reward for t in minibatch])
                    dones = np.array([t.done for t in minibatch])
                    next_states = np.array([t.next_state[0] for t in minibatch])

                    q_values = np.array(network.predict(states))
                    q_values_next = network.predict(next_states)

                    for Q, action, reward, next_Q, is_done in zip(
                            q_values, actions, rewards, q_values_next, dones):
                        Q[action] = reward + (0 if is_done else gamma * np.max(next_Q))

                    network.train(states, q_values)

                if i % 100 == 0:
                    network.update_target_weights()
                if i % 100 == 0:
                    network.save()

                state = next_state

            epsilon = np.exp(
                np.interp(env.episode + 1, [0, 5000],
                          [np.log(0.25), np.log(0.01)]))

    elif args.evolution:
        es = train(load_from='saved_model.pkl')
        np.save('best_params', es.best.get()[0])
        best_params = es.best.get()[0]
        play(best_params, render=True)
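
# Illustration (not from the source) of the epsilon schedule used above: the
# exploration rate decays log-linearly from 0.25 to 0.01 over the first 5000
# episodes and then stays at 0.01, because np.interp clamps outside its range.
import numpy as np

for episode in (0, 2500, 5000, 7500):
    eps = np.exp(np.interp(episode + 1, [0, 5000],
                           [np.log(0.25), np.log(0.01)]))
    print(episode, round(float(eps), 4))  # ~0.25, ~0.05, 0.01, 0.01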
def train():
    MODEL_NAME = "diamond9_input5"
    MODEL_NAME_save = "diamond9_input5"

    FOLDER = "Best_Dojos9"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + FOLDER + "/" + MODEL_NAME_save + "/" + MODEL_NAME_save + ".ckpt"
    LOGDIR = "./Logs/" + FOLDER + "/" + MODEL_NAME_save + "_2"

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 9
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = False

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=10,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=0,
                      history=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/" + FOLDER + "/")

    brain = Brain(epsilon=0.1, action_space=env.number_of_actions())

    model.setup(brain)

    score = tf.placeholder(tf.float32, [])
    avg_t = tf.placeholder(tf.float32, [])
    epsilon = tf.placeholder(tf.float32, [])
    avg_r = tf.placeholder(tf.float32, [])

    tf.summary.scalar('error', tf.squeeze(model.error))
    tf.summary.scalar('score', score)
    tf.summary.scalar('average time', avg_t)
    tf.summary.scalar('epsilon', epsilon)
    tf.summary.scalar('avg reward', avg_r)

    avg_time = 0
    avg_score = 0
    avg_error = 0
    avg_reward = 0
    cumulative_reward = 0

    # Number of episodes
    print_episode = 100
    total_episodes = 10000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # Assume that you have 12GB of GPU memory and want to allocate ~4GB:
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        # for episode in range(50):
        #     state, info = env.reset()
        #     done = False
        #     if RENDER_TO_SCREEN:
        #         env.render()
        #     while not done:
        #         action = brain.choose_action(state, sess, model)
        #         new_state, reward, done, info = env.step(action)
        #         brain.store_transition(state, action, reward, done, new_state)
        #         state = new_state
        #         if RENDER_TO_SCREEN:
        #             env.render()
        # print("\nREPLAY MEMORY INITIALISED")
        # print(brain.memCntr)

        writer.add_graph(sess.graph)

        start_time = time.time()
        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            # brain.linear_epsilon_decay(total_episodes, episode, start=0.4, end=0.05, percentage=0.8)
            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                action = brain.choose_action(state, sess, model)
                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                # e, Q_vector = brain.train_batch(4, model, sess)
                e, Q_vector = brain.train(model, sess)

                state = new_state
                cumulative_reward += reward

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e
                    avg_reward += cumulative_reward
                    cumulative_reward = 0

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tavg_reward {0:.3f}".format(avg_reward / print_episode),  # avg cumulative reward
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                # Save the model's weights and biases to .npz file
                model.save(sess, name=MODEL_NAME_save)
                # save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Q_vector,
                                 score: avg_score / print_episode,
                                 avg_t: avg_time / print_episode,
                                 epsilon: brain.EPSILON,
                                 avg_r: avg_reward / print_episode
                             })
                writer.add_summary(s, episode)

                avg_time = 0
                avg_score = 0
                avg_error = 0
                avg_reward = 0

        model.save(sess, verbose=True, name=MODEL_NAME_save)
        # save_path = saver.save(sess, MODEL_PATH_SAVE)
        # print("Model saved in path: %s" % save_path)

    writer.close()
def train():
    MODEL_NAME = "diamond_local15_maps"

    MODEL_PATH_SAVE = "./Models/Tensorflow/Maps/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"
    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 10
    LOCAL_GRID_SIZE = 15
    MAP_NUMBER = 0
    RANDOMIZE_MAPS = True

    # MAP_PATH = "./Maps/Grid{}/map{}.txt".format(GRID_SIZE, MAP_NUMBER)
    MAP_PATH = None

    print("\n ---- Training the Deep Neural Network ----- \n")

    RENDER_TO_SCREEN = False
    RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=50,
                      food_count=3,
                      obstacle_count=1,
                      lava_count=1,
                      zombie_count=0,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = Network(local_size=LOCAL_GRID_SIZE,
                    name=MODEL_NAME,
                    load=False,
                    path="./Models/Tensorflow/Maps/")

    brain = Brain(epsilon=0.05, action_space=env.number_of_actions())

    model.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    # writer = tf.summary.FileWriter(LOGDIR)

    # Assume that you have 12GB of GPU memory and want to allocate ~4GB:
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")

        sess.run(init)

        # writer.add_graph(sess.graph)

        start_time = time.time()
        print("")

        for episode in range(total_episodes):

            if RANDOMIZE_MAPS:
                # Make a random map 0: lava, 1: obstacle
                MAP_PATH = "./Maps/Grid10/map{}.txt".format(np.random.randint(10))
                env.set_map(MAP_PATH)

            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes, episode, start=0.5, end=0.05, percentage=0.6)
            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # Retrieve the Q values from the NN in vector form
                # Q_vector = sess.run(model.q_values, feed_dict={model.input: state})

                action = brain.choose_action(state, sess, model)
                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition(state, action, reward, done, new_state)

                e = brain.train(model, sess)

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                # s = sess.run(merged_summary, feed_dict={model.input: state, model.actions: Q_vector})
                # writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)
def train_MetaNetwork():
    print("\n ---- Training the Meta Network ----- \n")

    MODEL_NAME = "meta_network_local15"
    DIAMOND_MODEL_NAME = "diamond_dojo_local15"
    ZOMBIE_MODEL_NAME = "zombie_dojo_local15"
    # EXPLORE_MODEL_NAME = "explore_dojo_local15"

    MODEL_PATH_SAVE = "./Models/Tensorflow/" + MODEL_NAME + "/" + MODEL_NAME + ".ckpt"
    LOGDIR = "./Logs/" + MODEL_NAME

    USE_SAVED_MODEL_FILE = False

    GRID_SIZE = 8
    LOCAL_GRID_SIZE = 15
    MAP_PATH = None

    RENDER_TO_SCREEN = False
    # RENDER_TO_SCREEN = True

    env = Environment(wrap=False,
                      grid_size=GRID_SIZE,
                      local_size=LOCAL_GRID_SIZE,
                      rate=80,
                      max_time=200,
                      food_count=3,
                      obstacle_count=0,
                      lava_count=0,
                      zombie_count=1,
                      action_space=5,
                      map_path=MAP_PATH)

    if RENDER_TO_SCREEN:
        env.prerender()

    model = MetaNetwork(local_size=LOCAL_GRID_SIZE, name=MODEL_NAME, load=True)
    diamond_net = Network(local_size=LOCAL_GRID_SIZE,
                          name=DIAMOND_MODEL_NAME,
                          load=True,
                          trainable=False)
    zombie_net = Network(local_size=LOCAL_GRID_SIZE,
                         name=ZOMBIE_MODEL_NAME,
                         load=True,
                         trainable=False)
    # explore_net = Network(local_size=LOCAL_GRID_SIZE, name=EXPLORE_MODEL_NAME, load=True, trainable=False)

    brain = Brain(epsilon=0.01, action_space=2)

    model.setup(brain)
    diamond_net.setup(brain)
    zombie_net.setup(brain)
    # explore_net.setup(brain)

    tf.summary.scalar('error', tf.squeeze(model.error))

    avg_time = 0
    avg_score = 0
    avg_error = 0

    # Number of episodes
    print_episode = 1000
    total_episodes = 100000

    saver = tf.train.Saver()

    # Initialising all variables (weights and biases)
    init = tf.global_variables_initializer()

    # Adds a summary graph of the error over time
    merged_summary = tf.summary.merge_all()

    # Tensorboard capabilties
    writer = tf.summary.FileWriter(LOGDIR)

    # GPU capabilities
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)

    # Begin session
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        if USE_SAVED_MODEL_FILE:
            saver.restore(sess, MODEL_PATH_SAVE)
            print("Model restored.")
        else:
            sess.run(init)

        writer.add_graph(sess.graph)

        start_time = time.time()
        print("")

        for episode in range(total_episodes):
            state, info = env.reset()
            done = False

            brain.linear_epsilon_decay(total_episodes, episode, start=0.3, end=0.02, percentage=0.5)
            # brain.linear_alpha_decay(total_episodes, episode)

            if RENDER_TO_SCREEN:
                env.render()

            while not done:
                # Retrieve the Q values from the NN in vector form
                Dojo_vector = sess.run(model.q_values, feed_dict={model.input: state})

                dojo = brain.choose_action(state, sess, model)
                # print(dojo)

                if dojo == 0:
                    # state[2] = 0  # Zero out the zombies layer
                    state = np.delete(state, 2, 0)  # Take out the zombie layer
                    state = np.delete(state, 5, 0)  # Take out the history layer
                    action = brain.choose_dojo(state, sess, diamond_net,
                                               env.number_of_actions(), 0.01)
                elif dojo == 1:
                    # state[1] = 0  # Zero out the diamond layer
                    state = np.delete(state, 1, 0)  # Take out the diamond layer
                    state = np.delete(state, 5, 0)  # Take out the history layer
                    action = brain.choose_dojo(state, sess, zombie_net,
                                               env.number_of_actions(), 0.01)
                elif dojo == 2:
                    # Note: explore_net is commented out above, so this branch
                    # would fail if reached; with action_space=2 the meta
                    # network only selects dojo 0 or 1.
                    state = np.delete(state, 1, 0)  # Take out the diamond layer
                    state = np.delete(state, 2, 0)  # Take out the zombie layer
                    action = brain.choose_dojo(state, sess, explore_net,
                                               env.number_of_actions(), 0.01)

                # print(action)

                # Update environment by performing action
                new_state, reward, done, info = env.step(action)
                # print(new_state)

                brain.store_transition(state, dojo, reward, done, new_state)

                ## Standard training with learning after every step

                # print(tf.trainable_variables(scope=None))

                if done:
                    Dojo_vector[:, dojo] = reward
                    # print("Reward:", reward)
                else:
                    # Gathering the now current state's action-value vector
                    y_prime = sess.run(model.q_values, feed_dict={model.input: new_state})

                    # Equation for training
                    maxq = sess.run(model.y_prime_max, feed_dict={model.actions: y_prime})

                    # RL Equation
                    Dojo_vector[:, dojo] = reward + (brain.GAMMA * maxq)

                _, e = sess.run([model.optimizer, model.error],
                                feed_dict={
                                    model.input: state,
                                    model.actions: Dojo_vector
                                })

                ## Training using replay memory

                state = new_state

                if RENDER_TO_SCREEN:
                    env.render()

                if done:
                    avg_time += info["time"]
                    avg_score += info["score"]
                    avg_error += e

            if (episode % print_episode == 0 and episode != 0) or (episode == total_episodes - 1):
                current_time = math.floor(time.time() - start_time)
                print("Ep:", episode,
                      "\tavg t: {0:.3f}".format(avg_time / print_episode),
                      "\tavg score: {0:.3f}".format(avg_score / print_episode),
                      "\tErr {0:.3f}".format(avg_error / print_episode),
                      "\tepsilon {0:.3f}".format(brain.EPSILON),
                      end="")
                print_readable_time(current_time)

                avg_time = 0
                avg_score = 0
                avg_error = 0

                # Save the model's weights and biases to .npz file
                model.save(sess)
                save_path = saver.save(sess, MODEL_PATH_SAVE)

                s = sess.run(merged_summary,
                             feed_dict={
                                 model.input: state,
                                 model.actions: Dojo_vector
                             })
                writer.add_summary(s, episode)

        model.save(sess, verbose=True)

        save_path = saver.save(sess, MODEL_PATH_SAVE)
        print("Model saved in path: %s" % save_path)

    writer.close()