def eval(self):
    """
    Abstract evaluation flow
    """
    print("EVALUATION RUN")
    print("No exploration or dropout will be used")
    self.arglist.exploration_bonus = 0.0
    self.arglist.init_noise_scale = 0.0
    self.arglist.dropout_p = 0.0

    if self.arglist.load_models is None:
        print("WARNING: Evaluation run without loading any models!")

    # set random seeds before model creation
    if self.arglist.seed is not None:
        random.seed(self.arglist.seed)
        np.random.seed(self.arglist.seed)
        torch.manual_seed(self.arglist.seed)
        torch.cuda.manual_seed(self.arglist.seed)
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    env, env_name, task_name, n_agents, observation_sizes, action_sizes, discrete_actions = (
        self.create_environment())
    self.env = env
    self.n_agents = n_agents

    print("Observation sizes: ", observation_sizes)
    print("Action sizes: ", action_sizes)

    # report curiosity configuration
    if self.arglist.curiosity is None:
        print("No curiosity is to be used!")
    elif self.arglist.curiosity == "icm":
        print("Evaluation uses Intrinsic Curiosity Module (ICM)!")
    elif self.arglist.curiosity == "rnd":
        print("Evaluation uses Random Network Distillation (RND)!")
    elif self.arglist.curiosity == "count":
        print("Evaluation uses hash-based counting exploration bonus!")
    else:
        raise ValueError("Unknown curiosity: " + self.arglist.curiosity)

    # create algorithm trainer
    if self.arglist.alg == "maddpg":
        self.alg = MADDPG(n_agents, observation_sizes, action_sizes,
                          discrete_actions, self.arglist)
        print("Evaluating multi-agent deep deterministic policy gradient (MADDPG) on "
              + env_name + " environment")
    elif self.arglist.alg == "iql":
        self.alg = IQL(n_agents, observation_sizes, action_sizes,
                       discrete_actions, self.arglist)
        print("Evaluating independent Q-learning (IQL) on " + env_name +
              " environment")
    else:
        raise ValueError("Unknown algorithm: " + self.arglist.alg)

    # set random seeds past model creation
    if self.arglist.seed is not None:
        random.seed(self.arglist.seed)
        np.random.seed(self.arglist.seed)
        torch.manual_seed(self.arglist.seed)
        torch.cuda.manual_seed(self.arglist.seed)
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    if self.arglist.load_models is not None:
        print("Loading models from " + self.arglist.load_models +
              " with extension " + self.arglist.load_models_extension)
        self.alg.load_model_networks(
            self.arglist.load_models,
            "_" + self.arglist.load_models_extension)

    self.logger = Logger(n_agents, self.arglist.eta, task_name, None,
                         self.arglist.alg, self.arglist.curiosity)
    self.plotter = Plotter(
        self.logger,
        n_agents,
        self.arglist.eval_frequency,
        task_name,
        self.arglist.run,
        self.arglist.alg,
        self.arglist.curiosity,
    )
    if self.arglist.save_gifs:
        self.frame_saver = FrameSaver(self.arglist.eta, task_name,
                                      self.arglist.run, self.arglist.alg)

    print("Starting iterations...")
    start_time = time.time()
    t = 0
    for ep in range(self.arglist.num_episodes):
        obs = self.reset_environment()
        self.alg.reset(ep)

        episode_rewards = np.array([0.0] * n_agents)
        episode_length = 0
        done = False
        while not done and episode_length < self.arglist.max_episode_len:
            torch_obs = [
                Variable(torch.Tensor(obs[i]), requires_grad=False)
                for i in range(n_agents)
            ]
            actions, agent_actions = self.select_actions(torch_obs)
            rewards, dones, next_obs = self.environment_step(actions)
            t += 1
            episode_rewards += rewards

            # for displaying learned policies
            self.environment_render()
            if self.arglist.save_gifs:
                self.frame_saver.add_frame(self.env.render("rgb_array")[0], ep)

            obs = next_obs
            episode_length += 1
            done = all(dones)

        if self.arglist.alg == "maddpg":
            self.logger.log_episode(
                ep,
                episode_rewards,
                [0.0] * n_agents,
                self.alg.agents[0].get_exploration_scale(),
            )
        elif self.arglist.alg == "iql":
            self.logger.log_episode(
                ep,
                episode_rewards,
                [0.0] * n_agents,
                self.alg.agents[0].epsilon,
            )
        self.logger.dump_episodes(1)

        if self.arglist.save_gifs:
            self.frame_saver.save_episode_gif()

        if ep % 20 == 0 and ep > 0:
            # update plots
            self.plotter.update_reward_plot(True)
            self.plotter.update_exploration_plot(True)

    duration = time.time() - start_time
    print("Overall duration: %.2fs" % duration)

    env.close()
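
# Minimal usage sketch (illustrative, not part of the original module):
# `parse_args` and the concrete `Trainer` subclass are hypothetical stand-ins
# for whatever entry point the repository actually provides.
#
#     arglist = parse_args()       # hypothetical CLI argument parser
#     trainer = Trainer(arglist)   # concrete subclass implementing
#                                  # create_environment(), select_actions(), ...
#     if arglist.evaluate:         # hypothetical flag
#         trainer.eval()
#     else:
#         trainer.train()
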
def train(self):
    """
    Abstract training flow
    """
    # set random seeds before model creation
    if self.arglist.seed is not None:
        random.seed(self.arglist.seed)
        np.random.seed(self.arglist.seed)
        torch.manual_seed(self.arglist.seed)
        torch.cuda.manual_seed(self.arglist.seed)
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    # use multiple CPU threads if no GPU is available
    if not USE_CUDA:
        torch.set_num_threads(self.arglist.n_training_threads)

    env, env_name, task_name, n_agents, observation_sizes, action_sizes, discrete_actions = (
        self.create_environment())
    self.env = env
    self.n_agents = n_agents

    steps = self.arglist.num_episodes * self.arglist.max_episode_len
    # steps-th root of GOAL_EPSILON, so that epsilon multiplied by this
    # factor once per step reaches GOAL_EPSILON after `steps` steps
    decay_epsilon = GOAL_EPSILON ** (1 / float(steps))
    self.arglist.decay_factor = decay_epsilon
    print("Epsilon is decaying with factor %.7f to %.3f over %d steps." %
          (decay_epsilon, GOAL_EPSILON, steps))

    print("Observation sizes: ", observation_sizes)
    print("Action sizes: ", action_sizes)

    # report curiosity configuration
    if self.arglist.curiosity is None:
        print("No curiosity is to be used!")
    elif self.arglist.curiosity == "icm":
        print("Training uses Intrinsic Curiosity Module (ICM)!")
    elif self.arglist.curiosity == "rnd":
        print("Training uses Random Network Distillation (RND)!")
    elif self.arglist.curiosity == "count":
        print("Training uses hash-based counting exploration bonus!")
    else:
        raise ValueError("Unknown curiosity: " + self.arglist.curiosity)

    # create algorithm trainer
    if self.arglist.alg == "maddpg":
        self.alg = MADDPG(n_agents, observation_sizes, action_sizes,
                          discrete_actions, self.arglist)
        print("Training multi-agent deep deterministic policy gradient (MADDPG) on "
              + env_name + " environment")
    elif self.arglist.alg == "iql":
        self.alg = IQL(n_agents, observation_sizes, action_sizes,
                       discrete_actions, self.arglist)
        print("Training independent Q-learning (IQL) on " + env_name +
              " environment")
    else:
        raise ValueError("Unknown algorithm: " + self.arglist.alg)

    self.memory = ReplayBuffer(
        self.arglist.buffer_capacity,
        n_agents,
        observation_sizes,
        action_sizes,
        self.arglist.no_rewards,
    )

    # set random seeds past model creation
    if self.arglist.seed is not None:
        random.seed(self.arglist.seed)
        np.random.seed(self.arglist.seed)
        torch.manual_seed(self.arglist.seed)
        torch.cuda.manual_seed(self.arglist.seed)
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    if self.arglist.load_models is not None:
        print("Loading models from " + self.arglist.load_models +
              " with extension " + self.arglist.load_models_extension)
        self.alg.load_model_networks(
            self.arglist.load_models,
            "_" + self.arglist.load_models_extension)

    self.model_saver = ModelSaver(self.arglist.save_models_dir,
                                  self.arglist.run, self.arglist.alg)
    self.logger = Logger(
        n_agents,
        self.arglist.eta,
        task_name,
        self.arglist.run,
        self.arglist.alg,
        self.arglist.curiosity,
    )
    self.plotter = Plotter(
        self.logger,
        n_agents,
        self.arglist.eval_frequency,
        task_name,
        self.arglist.run,
        self.arglist.alg,
        self.arglist.curiosity,
    )
    if self.arglist.save_frames:
        self.frame_saver = FrameSaver(self.arglist.eta, task_name,
                                      self.arglist.run, self.arglist.alg)

    print("Starting iterations...")
    start_time = time.time()
    t = 0
    for ep in range(self.arglist.num_episodes):
        obs = self.reset_environment()
        self.alg.reset(ep)

        episode_rewards = np.array([0.0] * n_agents)
        if self.arglist.sparse_rewards:
            sparse_rewards = np.array([0.0] * n_agents)
        episode_length = 0
        done = False
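        # roll out one episode: agents select (possibly exploratory)
        # actions, each transition is pushed to the replay buffer, and the
        # networks are updated every `steps_per_update` environment steps
        # once the buffer holds at least one batch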
        interesting_episode = False
        while not done and episode_length < self.arglist.max_episode_len:
            torch_obs = [
                Variable(torch.Tensor(obs[i]), requires_grad=False)
                for i in range(n_agents)
            ]
            actions, agent_actions = self.select_actions(
                torch_obs, not self.arglist.no_exploration)
            rewards, dones, next_obs = self.environment_step(actions)
            episode_rewards += rewards
            if self.arglist.sparse_rewards:
                sparse_rewards += rewards

            if self.arglist.no_rewards:
                rewards = [0.0] * n_agents
            elif self.arglist.sparse_rewards:
                if (episode_length + 1) % self.arglist.sparse_freq == 0:
                    rewards = list(sparse_rewards / self.arglist.sparse_freq)
                else:
                    rewards = [0.0] * n_agents

            self.memory.push(obs, agent_actions, rewards, next_obs, dones)

            t += 1
            if (len(self.memory) >= self.arglist.batch_size
                    and (t % self.arglist.steps_per_update) == 0):
                losses = self.alg.update(self.memory, USE_CUDA)
                self.logger.log_losses(ep, losses)
                if self.arglist.dump_losses:
                    self.logger.dump_losses(1)

            # for displaying learned policies
            if self.arglist.display:
                self.environment_render()
            if self.arglist.save_frames:
                self.frame_saver.add_frame(self.env.render("rgb_array")[0], ep)
                if self.arglist.curiosity is not None:
                    curiosities = self.alg.get_curiosities(
                        obs, agent_actions, next_obs)
                    interesting = self.frame_saver.save_interesting_frame(
                        curiosities)
                    interesting_episode = interesting_episode or interesting

            obs = next_obs
            episode_length += 1
            done = all(dones)

        if ep % self.arglist.eval_frequency == 0:
            eval_rewards = np.zeros((self.arglist.eval_episodes, n_agents))
            for i in range(self.arglist.eval_episodes):
                ep_rewards, _, _ = self.eval(ep, n_agents)
                eval_rewards[i, :] = ep_rewards
            if self.arglist.alg == "maddpg":
                self.logger.log_episode(
                    ep,
                    eval_rewards.mean(0),
                    eval_rewards.var(0),
                    self.alg.agents[0].get_exploration_scale(),
                )
            elif self.arglist.alg == "iql":
                self.logger.log_episode(ep, eval_rewards.mean(0),
                                        eval_rewards.var(0),
                                        self.alg.agents[0].epsilon)
            self.logger.dump_episodes(1)

        if ep % 100 == 0 and ep > 0:
            duration = time.time() - start_time
            self.logger.dump_train_progress(ep, self.arglist.num_episodes,
                                            duration)

        if interesting_episode:
            self.frame_saver.save_episode_gif()

        if ep % (self.arglist.save_interval // 2) == 0 and ep > 0:
            # update plots
            self.plotter.update_reward_plot(self.arglist.plot)
            self.plotter.update_exploration_plot(self.arglist.plot)
            self.plotter.update_alg_loss_plot(self.arglist.plot)
            if self.arglist.curiosity is not None:
                self.plotter.update_cur_loss_plot(self.arglist.plot)
                self.plotter.update_intrinsic_reward_plot(self.arglist.plot)

        if ep % self.arglist.save_interval == 0 and ep > 0:
            # save plots
            print("Remove previous plots")
            self.plotter.clear_plots()
            print("Saving intermediate plots")
            self.plotter.save_reward_plot(str(ep))
            self.plotter.save_exploration_plot(str(ep))
            self.plotter.save_alg_loss_plots(str(ep))
            self.plotter.save_cur_loss_plots(str(ep))
            self.plotter.save_intrinsic_reward_plot(str(ep))

            # save models
            print("Remove previous models")
            self.model_saver.clear_models()
            print("Saving intermediate models")
            self.model_saver.save_models(self.alg, str(ep))

            # save logs
            print("Remove previous logs")
            self.logger.clear_logs()
            print("Saving intermediate logs")
            self.logger.save_episodes(extension=str(ep))
            self.logger.save_losses(extension=str(ep))

            # save parameter log
            self.logger.save_parameters(
                env_name,
                task_name,
                n_agents,
                observation_sizes,
                action_sizes,
                discrete_actions,
                self.arglist,
            )

    duration = time.time() - start_time
    print("Overall duration: %.2fs" % duration)

    # save plots
    print("Remove previous plots")
    self.plotter.clear_plots()
    print("Saving final plots")
    self.plotter.save_reward_plot("final")
    self.plotter.save_exploration_plot("final")
    self.plotter.save_alg_loss_plots("final")
    self.plotter.save_cur_loss_plots("final")
    self.plotter.save_intrinsic_reward_plot("final")

    # save models
    print("Remove previous models")
    self.model_saver.clear_models()
    print("Saving final models")
    self.model_saver.save_models(self.alg, "final")

    # save logs
    print("Remove previous logs")
    self.logger.clear_logs()
    print("Saving final logs")
    self.logger.save_episodes(extension="final")
    self.logger.save_losses(extension="final")
    self.logger.save_duration_cuda(duration, torch.cuda.is_available())

    # save parameter log
    self.logger.save_parameters(
        env_name,
        task_name,
        n_agents,
        observation_sizes,
        action_sizes,
        discrete_actions,
        self.arglist,
    )

    env.close()
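
# Sanity-check sketch for the epsilon-decay arithmetic in train() above
# (illustrative; `check_epsilon_decay` is not part of the original module).
# The per-step factor GOAL_EPSILON ** (1 / steps) is the steps-th root of
# GOAL_EPSILON, so epsilon, starting at 1.0 and multiplied by the factor
# once per environment step, ends at GOAL_EPSILON after `steps` steps.
def check_epsilon_decay(goal_epsilon=0.05, steps=25000):
    decay_factor = goal_epsilon ** (1 / float(steps))
    epsilon = 1.0
    for _ in range(steps):
        epsilon *= decay_factor
    # epsilon now equals goal_epsilon up to floating-point error
    return epsilon, decay_factor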