                                    header=True)

# Parameters for training a DQN model
N_EPISODE = 10000           # Number of episodes for training
MAX_STEP = 1000             # Maximum number of steps per episode
BATCH_SIZE = 32             # Number of experiences sampled for each replay
MEMORY_SIZE = 100000        # Capacity of the replay memory
SAVE_NETWORK = 100          # Save the DQN model for later testing every SAVE_NETWORK episodes
INITIAL_REPLAY_SIZE = 1000  # Number of experiences stored in memory before replay starts
INPUTNUM = 198              # Number of input values for the DQN model
ACTIONNUM = 6               # Number of actions output by the DQN model
MAP_MAX_X = 21              # Width of the map
MAP_MAX_Y = 9               # Height of the map

# Initialize a DQN model and a memory batch for storing experiences
DQNAgent = DQN(INPUTNUM, ACTIONNUM)
memory = Memory(MEMORY_SIZE)

# Initialize the environment: a communication channel between the DQN model
# and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # Connect to the game

train = False  # Set to True once replay has started, so that epsilon begins to decrease

# Training process: the main loop of the deep Q-learning algorithm
for episode_i in range(0, N_EPISODE):
    try:
        # Choose a map from the list
        mapID = np.random.randint(
    [-2, -2, 0, -2, 0, -2, 600, -2, -3, -3, -2, 100, -3, -2, -2, 0, -3, -3, 0, 0, 0],
    [0, -3, -3, -2, 0, 0, -1, 0, 0, -3, -2, 0, 0, 100, -1, 0, 0, -1, -1, -1, -2],
    [-2, 850, 1100, 0, -1, 100, -1, 450, 1050, -3, -2, 0, -3, 350, 0, 0, -1, -3, -3, -2, -1],
    [-1, -3, -1, -3, 0, -2, 0, 0, -2, -1, 0, -3, 400, -2, 0, -3, 700, -2, -3, -2, 0],
    [-2, -3, -1, -3, -1, 0, -1, -3, -2, -1, 300, -1, 0, -1, 200, -1, 150, -2, -3, -3, -1],
    [0, -3, -1, -3, 0, -2, -3, -3, 0, 0, 0, 0, -2, 300, -2, -3, -3, -3, -3, 0, -1],
    [0, -3, -1, -3, -1, -1, -2, -2, 0, -1, 0, -2, 0, -2, 0, 0, -2, -3, -3, 0, 0],
]
Map = np.array(map0.copy())
index = np.where(Map == 0)                         # coordinates of empty cells
listOfCoordinates = list(zip(index[0], index[1]))
'''
# Initialize a DQN model and a memory batch for storing experiences
weight_path = '/home/mayleo/Documents/Inreforcement learning/miner/TrainedModels/DQNmodel_MinerLoss_ep600.h5'
DQNAgent = DQN(input_image_dim, ACTIONNUM, gamma=0.95, epsilon=epsilon,
               learning_rate=0.01, load_weights=None)
memory = Memory(MEMORY_SIZE)

# Initialize the environment: a communication channel between the DQN model
# and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # Connect to the game

path = '/home/mayleo/Documents/Inreforcement learning/miner/Maps/'
train = False  # Set to True once replay has started, so that epsilon begins to decrease

# Training process: the main loop of the deep Q-learning algorithm
for episode_i in range(0, N_EPISODE):
    print('*****')
    try:
        # Choose a map ID from the Maps folder
        # (note: randint(0, 1) always yields 0, so only map 0 is ever used)
        mapID = np.random.randint(0, 1)
        maplist = [0, 25, 50, 75, 100]
pd.DataFrame(columns=header).to_csv(f, encoding='utf-8', index=False,
                                    header=True)

# Parameters for training a DQN model
N_EPISODE = 100000           # Number of episodes for training
MAX_STEP = 1000              # Maximum number of steps per episode
BATCH_SIZE = 32              # Number of experiences sampled for each replay
MEMORY_SIZE = 100000         # Capacity of the replay memory
SAVE_NETWORK = 1000          # Save the DQN model for later testing every SAVE_NETWORK episodes
INITIAL_REPLAY_SIZE = 10000  # Number of experiences stored in memory before replay starts
INPUTNUM = (2 * limit + 1) ** 2 + 3  # Number of input values for the DQN model (was 198)
ACTIONNUM = 6                # Number of actions output by the DQN model
MAP_MAX_X = 21               # Width of the map
MAP_MAX_Y = 9                # Height of the map

# Initialize a DQN model and a memory batch for storing experiences
DQNAgent = DQN(INPUTNUM, ACTIONNUM)
memory = Memory(MEMORY_SIZE)
bots = [Bot1(2), Bot2(3), Bot3(4)]

# Load a saved model to continue training
if args.load_model != "":
    file_name = "TrainedModels/DQNmodel_20200730-1832_ep1000out-30.json"
    json_file = file_name if args.load_model == "default" else args.load_model
    DQNAgent.load_model(json_file)

# Initialize the environment: a communication channel between the DQN model
# and the game environment (GAME_SOCKET_DUMMY.py)
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()  # Connect to the game

train = False  # Set to True once replay has started, so that epsilon begins to decrease

# Training process: the main loop of the deep Q-learning algorithm
# (the episode loop is sketched below)
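# The episode loop itself is not included in this excerpt. The sketch below is
# a minimal reconstruction, assuming the MinerEnv/DQN helper methods from the
# competition starter kit (send_map_info, reset, get_state, get_reward,
# check_terminate, act, replay, target_train, update_epsilon, save_model) and
# a Memory with push/sample/length; treat all of these as assumptions rather
# than the original implementation.
for episode_i in range(0, N_EPISODE):
    try:
        # Ask the server for a random map and initial position
        mapID = np.random.randint(1, 6)
        posID_x = np.random.randint(MAP_MAX_X)
        posID_y = np.random.randint(MAP_MAX_Y)
        request = "map{},{},{},50,100".format(mapID, posID_x, posID_y)
        minerEnv.send_map_info(request)

        minerEnv.reset()
        s = minerEnv.get_state()
        total_reward = 0
        for step in range(MAX_STEP):
            action = DQNAgent.act(s)              # epsilon-greedy action
            minerEnv.step(str(action))
            s_next = minerEnv.get_state()
            reward = minerEnv.get_reward()
            terminate = minerEnv.check_terminate()
            memory.push(s, action, reward, terminate, s_next)
            s = s_next
            total_reward += reward

            # Only start replaying once enough experiences are stored
            if memory.length > INITIAL_REPLAY_SIZE:
                DQNAgent.replay(memory.sample(BATCH_SIZE), BATCH_SIZE)
                train = True
            if terminate:
                break

        if train:
            DQNAgent.update_epsilon()             # decay epsilon once replay has started
        if (episode_i + 1) % SAVE_NETWORK == 0:
            DQNAgent.target_train()               # sync the target network
            DQNAgent.save_model("TrainedModels/", episode_i)
    except Exception:
        import traceback
        traceback.print_exc()
        break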
init_player = MedicalPlayer(files_list=args.files,
                            file_type=args.file_type,
                            landmark_ids=args.landmarks,
                            screen_dims=IMAGE_SIZE,
                            # TODO: why is this always play?
                            task='play',
                            agents=agents,
                            logger=logger)
NUM_ACTIONS = init_player.action_space.n

if args.task != 'train':
    # TODO: refactor DQN to not have to create both a q_network and
    # target_network
    dqn = DQN(agents,
              frame_history=FRAME_HISTORY,
              logger=logger,
              type=args.model_name)
    model = dqn.q_network
    model.load_state_dict(torch.load(args.load, map_location=model.device))
    environment = get_player(files_list=args.files,
                             file_type=args.file_type,
                             landmark_ids=args.landmarks,
                             saveGif=args.saveGif,
                             saveVideo=args.saveVideo,
                             task=args.task,
                             agents=agents,
                             viz=args.viz,
                             logger=logger)
    evaluator = Evaluator(environment, model, logger, agents,
                          args.steps_per_episode)
    evaluator.play_n_episodes()
class Trainer(object):
    def __init__(self,
                 env,
                 eval_env=None,
                 image_size=(45, 45, 45),
                 update_frequency=4,
                 replay_buffer_size=1e6,
                 init_memory_size=5e4,
                 max_episodes=100,
                 steps_per_episode=50,
                 eps=1,
                 min_eps=0.1,
                 delta=0.001,
                 batch_size=4,
                 gamma=0.9,
                 number_actions=6,
                 frame_history=4,
                 model_name="CommNet",
                 logger=None,
                 train_freq=1,
                 team_reward=False,
                 attention=False):
        self.env = env
        self.eval_env = eval_env
        self.agents = env.agents
        self.image_size = image_size
        self.update_frequency = update_frequency
        self.replay_buffer_size = replay_buffer_size
        self.init_memory_size = init_memory_size
        self.max_episodes = max_episodes
        self.steps_per_episode = steps_per_episode
        self.eps = eps
        self.min_eps = min_eps
        self.delta = delta
        self.batch_size = batch_size
        self.gamma = gamma
        self.number_actions = number_actions
        self.frame_history = frame_history
        self.epoch_length = self.env.files.num_files
        self.best_val_distance = float('inf')
        self.buffer = ReplayMemory(self.replay_buffer_size, self.image_size,
                                   self.frame_history, self.agents)
        self.dqn = DQN(self.agents,
                       self.frame_history,
                       logger=logger,
                       type=model_name,
                       collective_rewards=team_reward,
                       attention=attention)
        self.dqn.q_network.train(True)
        self.evaluator = Evaluator(eval_env, self.dqn.q_network, logger,
                                   self.agents, steps_per_episode)
        self.logger = logger
        self.train_freq = train_freq

    def train(self):
        self.logger.log(self.dqn.q_network)
        self.init_memory()
        episode = 1
        acc_steps = 0
        epoch_distances = []
        while episode <= self.max_episodes:
            # Reset the environment for the start of the episode.
            obs = self.env.reset()
            terminal = [False for _ in range(self.agents)]
            losses = []
            score = [0] * self.agents
            for step_num in range(self.steps_per_episode):
                acc_steps += 1
                acts, q_values = self.get_next_actions(
                    self.buffer.recent_state())
                # Step the agent once, and get the transition tuple
                obs, reward, terminal, info = self.env.step(
                    np.copy(acts), q_values, terminal)
                score = [sum(x) for x in zip(score, reward)]
                self.buffer.append((obs, acts, reward, terminal))
                if acc_steps % self.train_freq == 0:
                    mini_batch = self.buffer.sample(self.batch_size)
                    loss = self.dqn.train_q_network(mini_batch, self.gamma)
                    losses.append(loss)
                if all(t for t in terminal):
                    break
            epoch_distances.append(
                [info['distError_' + str(i)] for i in range(self.agents)])
            self.append_episode_board(info, score, "train", episode)
            # Periodically sync the target network with the Q-network
            if (episode * self.epoch_length) % self.update_frequency == 0:
                self.dqn.copy_to_target_network()
            # Linearly anneal epsilon towards min_eps
            self.eps = max(self.min_eps, self.eps - self.delta)
            # Every epoch
            if episode % self.epoch_length == 0:
                self.append_epoch_board(epoch_distances, self.eps, losses,
                                        "train", episode)
                self.validation_epoch(episode)
                self.dqn.save_model(name="latest_dqn.pt", forced=True)
                self.dqn.scheduler.step()
                epoch_distances = []
            episode += 1

    def init_memory(self):
        self.logger.log("Initialising memory buffer...")
        pbar = tqdm(desc="Memory buffer", total=self.init_memory_size)
        while len(self.buffer) < self.init_memory_size:
            # Reset the environment for the start of the episode.
            obs = self.env.reset()
            terminal = [False for _ in range(self.agents)]
            steps = 0
            for _ in range(self.steps_per_episode):
                steps += 1
                acts, q_values = self.get_next_actions(obs)
                obs, reward, terminal, info = self.env.step(
                    acts, q_values, terminal)
                self.buffer.append((obs, acts, reward, terminal))
                if all(t for t in terminal):
                    break
            pbar.update(steps)
        pbar.close()
        self.logger.log("Memory buffer filled")

    def validation_epoch(self, episode):
        if self.eval_env is None:
            return
        self.dqn.q_network.train(False)
        epoch_distances = []
        for k in range(self.eval_env.files.num_files):
            self.logger.log(f"eval episode {k}")
            (score, start_dists, q_values,
             info) = self.evaluator.play_one_episode()
            epoch_distances.append(
                [info['distError_' + str(i)] for i in range(self.agents)])
        val_dists = self.append_epoch_board(epoch_distances,
                                            name="eval",
                                            episode=episode)
        if val_dists < self.best_val_distance:
            self.logger.log("Improved new best mean validation distances")
            self.best_val_distance = val_dists
            self.dqn.save_model(name="best_dqn.pt", forced=True)
        self.dqn.q_network.train(True)

    def append_episode_board(self, info, score, name="train", episode=0):
        dists = {str(i): info['distError_' + str(i)]
                 for i in range(self.agents)}
        self.logger.write_to_board(f"{name}/dist", dists, episode)
        scores = {str(i): score[i] for i in range(self.agents)}
        self.logger.write_to_board(f"{name}/score", scores, episode)

    def append_epoch_board(self, epoch_dists, eps=0, losses=[],
                           name="train", episode=0):
        epoch_dists = np.array(epoch_dists)
        if name == "train":
            self.logger.write_to_board(name, {"eps": eps}, episode)
            if len(losses) > 0:
                loss_dict = {"loss": sum(losses) / len(losses)}
                self.logger.write_to_board(name, loss_dict, episode)
        for i in range(self.agents):
            mean_dist = sum(epoch_dists[:, i]) / len(epoch_dists[:, i])
            mean_dist_dict = {str(i): mean_dist}
            self.logger.write_to_board(f"{name}/mean_dist", mean_dist_dict,
                                       episode)
            min_dist_dict = {str(i): min(epoch_dists[:, i])}
            self.logger.write_to_board(f"{name}/min_dist", min_dist_dict,
                                       episode)
            max_dist_dict = {str(i): max(epoch_dists[:, i])}
            self.logger.write_to_board(f"{name}/max_dist", max_dist_dict,
                                       episode)
        # NOTE: at this point mean_dist_dict only holds the last agent's mean
        # distance, so the returned value tracks that agent alone.
        return np.array(list(mean_dist_dict.values())).mean()

    def get_next_actions(self, obs_stack):
        # epsilon-greedy policy
        if np.random.random() < self.eps:
            q_values = np.zeros((self.agents, self.number_actions))
            actions = np.random.randint(self.number_actions,
                                        size=self.agents)
        else:
            actions, q_values = self.get_greedy_actions(obs_stack,
                                                        doubleLearning=True)
        return actions, q_values

    def get_greedy_actions(self, obs_stack, doubleLearning=True):
        inputs = torch.tensor(obs_stack).unsqueeze(0)
        if doubleLearning:
            q_vals = self.dqn.q_network.forward(inputs).detach().squeeze(0)
        else:
            q_vals = self.dqn.target_network.forward(
                inputs).detach().squeeze(0)
        idx = torch.max(q_vals, -1)[1]
        greedy_steps = np.array(idx, dtype=np.int32).flatten()
        return greedy_steps, q_vals.data.numpy()
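# A minimal usage sketch for the Trainer above (illustrative, not part of the
# original file). Every keyword argument mirrors Trainer.__init__; `train_env`
# and `val_env` are assumed to be MedicalPlayer-style environments built
# elsewhere (e.g. via get_player), and `logger` an instance of this repo's
# Logger.
trainer = Trainer(train_env,
                  eval_env=val_env,
                  image_size=(45, 45, 45),
                  max_episodes=1000,
                  steps_per_episode=200,
                  eps=1,
                  min_eps=0.1,
                  delta=1e-4,
                  batch_size=64,
                  frame_history=4,
                  model_name="CommNet",
                  logger=logger,
                  team_reward=False,
                  attention=False)
trainer.train()  # fills the replay buffer, then runs the epsilon-greedy DQN loop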
INITIAL_REPLAY_SIZE = 100  # Number of experiences stored in memory before replay starts
INPUTNUM = 198             # Number of input values for the DQN model
ACTIONNUM = 6              # Number of actions output by the DQN model
MAP_MAX_X = 21             # Width of the map
MAP_MAX_Y = 9              # Height of the map
load_checkpoint = False

# Initialize a DQN model and a memory batch for storing experiences
DQNAgent = DQN(
    INPUTNUM,
    ACTIONNUM,
    batch_size=BATCH_SIZE,
    mem_size=50000,
    eps_min=0.1,
    replace=1000,  # presumably the target-network replacement interval
    eps_dec=1e-5,
    chkpt_dir="models/",
    algo="dqnagent",
    env_name="minerai",
    gamma=0.99,
    epsilon=1,
    lr=0.00001,
)
if load_checkpoint:
    DQNAgent.load_models()

# Initialize environment
minerEnv = MinerEnv(HOST, PORT)
minerEnv.start()

fname = (DQNAgent.algo + "_" + DQNAgent.env_name + "_lr" + str(DQNAgent.lr) +
error_message = f"""Wrong input files {len(args.files)} for {args.task} task - should be 1 \'images.txt\' """ assert len(args.files) == 1, (error_message) else: error_message = f"""Wrong input files {len(args.files)} for {args.task} task - should be 2 [\'images.txt\', \'landmarks.txt\'] """ assert len(args.files) == 2, (error_message) if args.seed is not None: set_reproducible(args.seed) logger = Logger(args.log_dir, args.write, args.save_freq, comment=args.log_comment) if args.task != 'train': dqn = DQN(agents, frame_history=FRAME_HISTORY, logger=logger, type=args.model_name, collective_rewards=args.team_reward, attention=args.attention) model = dqn.q_network model.load_state_dict(torch.load(args.load, map_location=model.device)) environment = get_player(files_list=args.files, file_type=args.file_type, landmark_ids=args.landmarks, saveGif=args.saveGif, saveVideo=args.saveVideo, task=args.task, agents=agents, viz=args.viz, logger=logger) evaluator = Evaluator(environment, model, logger, agents, args.steps_per_episode) evaluator.play_n_episodes(fixed_spawn=args.fixed_spawn) else: # train model