def init_training_algo(self, num_envs=None):
    """
    Initialize the training algorithm. This primarily calls the object
    creation functions for A2C or PPO2 and the optimizer, but it also
    spawns a number of parallel environments, based on self.num_cpu or
    the num_envs input (if provided).

    Note: spawning the parallel environments is VERY slow due to
    deepcopying the termination sets. Several workarounds were tried,
    but none worked properly, so we are stuck with it for now.

    :param num_envs: an override for the default number of environments
        to spawn (in self.num_cpu)
    """
    if not num_envs:
        num_envs = self.num_cpu

    if self.model_type == "A2C":
        # Check that the A2C-specific parameters are set
        assert self.optim_alpha
        # Spawn parallel environments
        self.training_envs = [deepcopy(self.env) for _ in range(num_envs)]
        if self.acmodel.recurrent:
            self.memories = torch.zeros(num_envs, self.acmodel.memory_size,
                                        device=self.device)
        self.algo = torch_ac.A2CAlgo(
            self.training_envs, self.acmodel, self.device,
            self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
            self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
            self.recurrence, self.optim_alpha, self.optim_eps,
            self.preprocess_obss)
    elif self.model_type == "PPO2":
        # Check that the PPO2-specific parameters are set
        assert self.clip_eps and self.epochs and self.batch_size
        # Spawn parallel environments
        self.training_envs = [deepcopy(self.env) for _ in range(num_envs)]
        if self.acmodel.recurrent:
            self.memories = torch.zeros(num_envs, self.acmodel.memory_size,
                                        device=self.device)
        self.algo = torch_ac.PPOAlgo(
            self.training_envs, self.acmodel, self.device,
            self.frames_per_proc, self.discount, self.lr, self.gae_lambda,
            self.entropy_coef, self.value_loss_coef, self.max_grad_norm,
            self.recurrence, self.optim_eps, self.clip_eps, self.epochs,
            self.batch_size, self.preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(self.model_type))

    # Load the optimizer state, if it exists
    if "optimizer_state" in self.status:
        self.algo.optimizer.load_state_dict(self.status["optimizer_state"])
        self.txt_logger.info("Optimizer loaded\n")
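# A minimal sketch of driving the algo that init_training_algo builds.
# `trainer` stands for an instance of the (unnamed) class this method
# belongs to, with model_type, hyperparameters, env, and status already
# set -- the names here are assumptions for illustration, not a confirmed API.
trainer.init_training_algo(num_envs=4)  # override self.num_cpu; deepcopy spawning is slow

# torch_ac algos expose the collect/update cycle used throughout these scripts.
exps, logs1 = trainer.algo.collect_experiences()
logs2 = trainer.algo.update_parameters(exps)
print(logs2["policy_loss"], logs2["value_loss"])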
elif args.pretrained_gnn:
    acmodel.load_pretrained_gnn(pretrained_status["model_state"])
    txt_logger.info("Pretrained model loaded.\n")

acmodel.to(device)
txt_logger.info("Model loaded.\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if args.algo == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_alpha, args.optim_eps, preprocess_obss)
elif args.algo == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                            args.discount, args.lr, args.gae_lambda,
                            args.entropy_coef, args.value_loss_coef,
                            args.max_grad_norm, args.recurrence,
                            args.optim_eps, args.clip_eps, args.epochs,
                            args.batch_size, preprocess_obss)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))

if "optimizer_state" in status:
    algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Loading optimizer from existing run.\n")
    txt_logger.info("Optimizer loaded.\n")

# Init the evaluator
if args.eval:
    eval_samplers = args.ltl_samplers_eval if args.ltl_samplers_eval else [args.ltl_sampler]
    eval_env = args.eval_env if args.eval_env else args.env
    eval_procs = args.eval_procs if args.eval_procs else args.procs
import argparse
import datetime
import sys
import time

import numpy
import tensorboardX
import torch
import torch_ac
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize

import utils
from model import ACModel  # project-local import (path assumed)


def main():
    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo", required=True,
                        help="algorithm to use: a2c | ppo | ppo_intrinsic (REQUIRED)")
    parser.add_argument("--env", required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument("--model", default=None,
                        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed", type=int, default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument("--save-interval", type=int, default=10,
                        help="number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs", type=int, default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames", type=int, default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size", type=int, default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument("--frames-per-proc", type=int, default=None,
                        help="number of frames per process before update (default: 5 for A2C and 128 for PPO)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
                        help="lambda coefficient in GAE formula (default: 0.95, 1 means no GAE)")
    parser.add_argument("--entropy-coef", type=float, default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--optim-eps", type=float, default=1e-8,
                        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha", type=float, default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--recurrence", type=int, default=1,
                        help="number of time-steps the gradient is backpropagated (default: 1). "
                             "If > 1, an LSTM is added to the model to provide memory.")
    parser.add_argument("--text", action="store_true", default=False,
                        help="add a GRU to the model to handle text input")
    parser.add_argument("--visualize", action="store_true", default=False,
                        help="show real-time CNN layer weight changes")

    args = parser.parse_args()
    args.mem = args.recurrence > 1

    # Set run dir
    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"
    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments
    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo
    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(envs, acmodel, device, args.frames_per_proc,
                                         args.discount, args.lr, args.gae_lambda,
                                         args.entropy_coef, args.value_loss_coef,
                                         args.max_grad_norm, args.recurrence,
                                         args.optim_eps, args.clip_eps, args.epochs,
                                         args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(envs, acmodel, device, args.frames_per_proc,
                                         args.discount, args.lr, args.gae_lambda,
                                         args.entropy_coef, args.value_loss_coef,
                                         args.max_grad_norm, args.recurrence,
                                         args.optim_alpha, args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
        txt_logger.info("Optimizer loaded\n")

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:
        # Store copies of the model parameters at step t
        old_parameters = {}
        for name, param in acmodel.named_parameters():
            old_parameters[name] = param.detach().cpu().numpy().copy()

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        # Store copies of the model parameters at step t+1
        new_parameters = {}
        for name, param in acmodel.named_parameters():
            new_parameters[name] = param.detach().cpu().numpy().copy()

        # Compute the L2 norm of the parameter differences and render the
        # weight-change visualization for the convolutional layers
        for index in range(len(old_parameters.keys())):
            if index == 0 or index == 2 or index == 4:
                key = list(old_parameters.keys())[index]
                old_weights = old_parameters[key]
                new_weights = new_parameters[key]
                norm_diff = numpy.linalg.norm(new_weights - old_weights)
                diff_matrix = abs(new_weights - old_weights)
                diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0],
                                                    norm='max', axis=0)
                if print_visual:
                    axs[int(index / 2)].imshow(diff_matrix[:, :, 0, 0],
                                               cmap='Greens',
                                               interpolation='nearest')

        # This allows the plots to update as the model trains
        if print_visual:
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
            data += [logs["entropy"], logs["value"], logs["policy_loss"],
                     logs["value_loss"], logs["grad_norm"]]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if status["num_frames"] == 0:
                csv_logger.writerow(header)
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {"num_frames": num_frames, "update": update,
                      "model_state": acmodel.state_dict(),
                      "optimizer_state": algo.optimizer.state_dict()}
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
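# A self-contained sketch of the per-layer weight-change measurement used in
# the loop above, on a toy tensor instead of a live model. The 4-D conv
# weight layout (out_channels, in_channels, kH, kW) and the use of
# sklearn's `normalize` are assumptions carried over from that loop.
import numpy
from sklearn.preprocessing import normalize

old_w = numpy.random.rand(16, 3, 3, 3)                  # conv weights before the update
new_w = old_w + 0.01 * numpy.random.rand(16, 3, 3, 3)   # conv weights after the update

norm_diff = numpy.linalg.norm(new_w - old_w)            # scalar L2 magnitude of the change
diff = abs(new_w - old_w)
# Max-normalize the (out, in) slice at kernel position (0, 0) column-wise,
# exactly as the training loop does before handing the slice to imshow.
slice00 = normalize(diff[:, :, 0, 0], norm='max', axis=0)
print(norm_diff, slice00.shape)                         # -> scalar, (16, 3)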
import argparse
import datetime
import math
import sys
import time

import tensorboardX
import torch
import torch_ac

import utils
from model import ACModel, ACModel_average  # project-local imports (paths assumed)


def main(raw_args=None):
    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo", required=True,
                        help="algorithm to use: a2c | ppo | ipo (REQUIRED)")
    parser.add_argument("--domain1", required=True,
                        help="name of the first domain to train on (REQUIRED)")
    parser.add_argument("--domain2", required=True,
                        help="name of the second domain to train on (REQUIRED)")
    parser.add_argument("--p1", required=True, type=float,
                        help="proportion of training environments from the first domain (REQUIRED)")
    parser.add_argument("--model", required=True,
                        help="name of the model")
    parser.add_argument("--seed", type=int, default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument("--save-interval", type=int, default=10,
                        help="number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs", type=int, default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames", type=int, default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size", type=int, default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument("--frames-per-proc", type=int, default=None,
                        help="number of frames per process before update (default: 5 for A2C and 128 for PPO)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
                        help="lambda coefficient in GAE formula (default: 0.95, 1 means no GAE)")
    parser.add_argument("--entropy-coef", type=float, default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--optim-eps", type=float, default=1e-8,
                        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha", type=float, default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--recurrence", type=int, default=1,
                        help="number of time-steps the gradient is backpropagated (default: 1). "
                             "If > 1, an LSTM is added to the model to provide memory.")
    parser.add_argument("--text", action="store_true", default=False,
                        help="add a GRU to the model to handle text input")

    args = parser.parse_args(raw_args)
    args.mem = args.recurrence > 1

    # Check PyTorch version
    if torch.__version__ != '1.2.0':
        raise ValueError(
            "PyTorch version must be 1.2.0 (see README). Your version is {}.".format(
                torch.__version__))
    if args.mem:
        raise ValueError("Policies with memory are not supported.")

    # Set run dir
    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = args.model
    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    torch.backends.cudnn.deterministic = True
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments from the two domains
    domain1 = args.domain1  # e.g., 'MiniGrid-ColoredKeysRed-v0'
    domain2 = args.domain2  # e.g., 'MiniGrid-ColoredKeysYellow-v0'
    p1 = args.p1            # proportion of environments from domain1

    num_envs_total = args.procs                  # total number of environments
    num_domain1 = math.ceil(p1 * num_envs_total) # number of environments in domain1
    num_domain2 = num_envs_total - num_domain1   # number of environments in domain2

    # Environments from domain1
    envs1 = []
    for i in range(num_domain1):
        envs1.append(utils.make_env(domain1, args.seed + 10000 * i))

    # Environments from domain2
    envs2 = []
    for i in range(num_domain2):
        envs2.append(utils.make_env(domain2, args.seed + 10000 * i))

    # All environments
    envs = envs1 + envs2
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model (the averaged variant for the IPO game, the standard
    # actor-critic model for PPO or A2C)
    if args.algo == "ipo":
        acmodel = ACModel_average(obs_space, envs[0].action_space, args.mem, args.text)
    else:
        acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo
    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps, preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")
    elif args.algo == "ipo":
        # One algo per domain: the two algos step different environments
        # but share the same acmodel.
        algo1 = torch_ac.IPOAlgo(envs1, acmodel, 1, device, args.frames_per_proc,
                                 args.discount, args.lr, args.gae_lambda,
                                 args.entropy_coef, args.value_loss_coef,
                                 args.max_grad_norm, args.recurrence,
                                 args.optim_eps, args.clip_eps, args.epochs,
                                 args.batch_size, preprocess_obss)
        algo2 = torch_ac.IPOAlgo(envs2, acmodel, 2, device, args.frames_per_proc,
                                 args.discount, args.lr, args.gae_lambda,
                                 args.entropy_coef, args.value_loss_coef,
                                 args.max_grad_norm, args.recurrence,
                                 args.optim_eps, args.clip_eps, args.epochs,
                                 args.batch_size, preprocess_obss)
        if "optimizer_state1" in status:
            algo1.optimizer.load_state_dict(status["optimizer_state1"])
            txt_logger.info("Optimizer 1 loaded\n")
        if "optimizer_state2" in status:
            algo2.optimizer.load_state_dict(status["optimizer_state2"])
            txt_logger.info("Optimizer 2 loaded\n")
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        update_start_time = time.time()

        if args.algo == "ipo":
            # Collect experiences on the first domain and update the
            # parameters of the model copy corresponding to it
            exps1, logs_exps1 = algo1.collect_experiences()
            logs_algo1 = algo1.update_parameters(exps1)

            # Collect experiences on the second domain and update the
            # parameters of the model copy corresponding to it
            exps2, logs_exps2 = algo2.collect_experiences()
            logs_algo2 = algo2.update_parameters(exps2)

            update_end_time = time.time()

            # Combine logs: per-episode statistics are concatenated, scalar
            # statistics are averaged, weighted by the number of
            # environments in each domain
            logs_exps = {
                'return_per_episode':
                    logs_exps1["return_per_episode"] + logs_exps2["return_per_episode"],
                'reshaped_return_per_episode':
                    logs_exps1["reshaped_return_per_episode"]
                    + logs_exps2["reshaped_return_per_episode"],
                'num_frames_per_episode':
                    logs_exps1["num_frames_per_episode"]
                    + logs_exps2["num_frames_per_episode"],
                'num_frames':
                    logs_exps1["num_frames"] + logs_exps2["num_frames"]
            }
            logs_algo = {
                'entropy': (num_domain1 * logs_algo1["entropy"]
                            + num_domain2 * logs_algo2["entropy"]) / num_envs_total,
                'value': (num_domain1 * logs_algo1["value"]
                          + num_domain2 * logs_algo2["value"]) / num_envs_total,
                'policy_loss': (num_domain1 * logs_algo1["policy_loss"]
                                + num_domain2 * logs_algo2["policy_loss"]) / num_envs_total,
                'value_loss': (num_domain1 * logs_algo1["value_loss"]
                               + num_domain2 * logs_algo2["value_loss"]) / num_envs_total,
                'grad_norm': (num_domain1 * logs_algo1["grad_norm"]
                              + num_domain2 * logs_algo2["grad_norm"]) / num_envs_total
            }
            logs = {**logs_exps, **logs_algo}
            num_frames += logs["num_frames"]
        else:
            # Standard single-algo method
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()
            num_frames += logs["num_frames"]

        update += 1

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
"grad_norm" ] data += [ logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"], logs["grad_norm"] ] txt_logger.info( "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}" .format(*data)) header += ["return_" + key for key in return_per_episode.keys()] data += return_per_episode.values() # header += ["debug_last_env_reward"] # data += [logs["debug_last_env_reward"]] header += ["total_loss"] data += [ logs["policy_loss"] - args.entropy_coef * logs["entropy"] + args.value_loss_coef * logs["value_loss"] ] if status["num_frames"] == 0: csv_logger.writerow(header) csv_logger.writerow(data) csv_file.flush() for field, value in zip(header, data): tb_writer.add_scalar(field, value, num_frames) # Save status if args.save_interval > 0 and update % args.save_interval == 0: if args.algo == "ipo": status = { "num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(), "optimizer_state1": algo1.optimizer.state_dict(), "optimizer_state2": algo2.optimizer.state_dict() } else: status = { "num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict() } if hasattr(preprocess_obss, "vocab"): status["vocab"] = preprocess_obss.vocab.vocab utils.save_status(status, model_dir) txt_logger.info("Status saved")
def step(self, action):
    self.step_count += 1

    reward = 0
    done = False

    # Get the position in front of the agent
    fwd_pos = self.front_pos

    # Get the contents of the cell in front of the agent
    fwd_cell = self.grid.get(*fwd_pos)

    # Rotate left
    if action == self.actions.left:
        self.agent_dir -= 1
        if self.agent_dir < 0:
            self.agent_dir += 4

    # Rotate right
    elif action == self.actions.right:
        self.agent_dir = (self.agent_dir + 1) % 4

    # Move forward
    elif action == self.actions.forward:
        if fwd_cell is None or fwd_cell.can_overlap():
            self.agent_pos = fwd_pos
        if fwd_cell is not None and fwd_cell.type == 'goal' and not self.is_teaching:
            done = True
            reward = self._reward()
        if fwd_cell is not None and fwd_cell.type == 'lava':
            done = True

    # Pick up an object
    elif action == self.actions.pickup:
        if fwd_cell and fwd_cell.can_pickup():
            if self.carrying is None:
                self.carrying = fwd_cell
                self.carrying.cur_pos = np.array([-1, -1])
                self.grid.set(*fwd_pos, None)

    # Drop an object
    elif action == self.actions.drop:
        if not fwd_cell and self.carrying:
            self.grid.set(*fwd_pos, self.carrying)
            self.carrying.cur_pos = fwd_pos
            self.carrying = None

    # Toggle/activate an object
    elif action == self.actions.toggle:
        if fwd_cell:
            fwd_cell.toggle(self, fwd_pos)

    # Done action (only used while teaching)
    elif action == self.actions.done:
        if self.step_count >= self.min_steps and self.is_teaching:
            done = True

    else:
        assert False, "unknown action"

    if self.step_count >= self.max_steps:
        done = True

    obs = self.gen_obs()

    if done and self.is_teaching:
        # Train the student on environments that end where the teacher
        # stopped, then score the teacher against the student's returns
        student_return_avg = []
        envs = []
        for i in range(self.args.procs):
            env = gym.make(self.args.env)
            env.seed(self.args.seed)
            env.is_teaching = False
            env.end_pos = self.agent_pos
            envs.append(env)

        update = 0
        num_frames = 0

        # md_index = np.random.choice(range(len(self.student_hist_models)), 1,
        #     p=self.sampling_dist(len(self.student_hist_models),
        #                          strategy=self.args.sampling_strategy))[0]
        # if np.random.random() < self.args.historical_averaging and not self.args.intra:
        #     md = copy.deepcopy(self.student_hist_models[md_index])
        # else:
        #     md = self.student_hist_models[md_index]
        md = self.student_hist_models[-1]

        # while num_frames < self.args.frames and update < 5:
        while update < self.args.s_iters_per_teaching:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            algo = torch_ac.PPOAlgo(
                envs, md, device, self.args.frames_per_proc, self.args.discount,
                self.args.lr, self.args.gae_lambda, self.args.entropy_coef,
                self.args.value_loss_coef, self.args.max_grad_norm,
                self.args.recurrence, self.args.optim_eps, self.args.clip_eps,
                self.args.epochs, self.args.batch_size, self.preprocess_obss)
            algo.args = self.args
            if self.args.intra:
                algo.historical_models = self.student_hist_models

            update_start_time = time.time()
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()

            num_frames += logs["num_frames"]
            student_return_avg.append(
                self.synthesize(logs["reshaped_return_per_episode"])["mean"])
            update += 1
            print(update, end=",")

            # status = {"num_frames": num_frames, "update": update,
            #           "model_state": md.state_dict(),
            #           "optimizer_state": algo.optimizer.state_dict()}
            # if hasattr(self.preprocess_obss, "vocab"):
            #     status["vocab"] = self.preprocess_obss.vocab.vocab
            # utils.save_status(status, self.model_dir)

            if not self.args.intra:
                self.student_hist_models.append(copy.deepcopy(md))
            if np.random.random() < self.args.historical_averaging and not self.args.intra:
                # self.student_hist_models.append(md)
                md_index = np.random.choice(
                    range(len(self.student_hist_models)), 1,
                    p=self.sampling_dist(len(self.student_hist_models),
                                         strategy=self.args.sampling_strategy))[0]
                md = copy.deepcopy(self.student_hist_models[md_index])
            # Logging is disabled here (`and False`); the block is kept for
            # reference and references names defined in the training script
            if update % self.args.log_interval == 0 and False:
                fps = logs["num_frames"] / (update_end_time - update_start_time)
                duration = int(time.time() - start_time)
                return_per_episode = utils.synthesize(logs["return_per_episode"])
                rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
                num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

                header = ["update", "frames", "FPS", "duration"]
                data = [update, num_frames, fps, duration]
                header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
                data += rreturn_per_episode.values()
                header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
                data += num_frames_per_episode.values()
                header += ["entropy", "value", "policy_loss", "value_loss", "grad_norm"]
                data += [logs["entropy"], logs["value"], logs["policy_loss"],
                         logs["value_loss"], logs["grad_norm"]]

                txt_logger.info(
                    "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                    .format(*data))

                header += ["return_" + key for key in return_per_episode.keys()]
                data += return_per_episode.values()

                if status["num_frames"] == 0:
                    csv_logger.writerow(header)
                csv_logger.writerow(data)
                csv_file.flush()

                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

        # The teacher is rewarded by how much head start it retains over
        # the freshly trained student's average return
        reward = max([0, self._reward() - numpy.average(student_return_avg)])

    return obs, reward, done, {"agent_pos": self.agent_pos}
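# A worked example of the teacher reward computed above (the numbers are
# illustrative, not from a real run): if the teacher's own goal reward
# self._reward() is 0.9 and the student's average reshaped return over the
# teaching updates is 0.65, then
#
#   reward = max(0, 0.9 - 0.65) = 0.25
#
# The teacher earns more when the student still has room to improve, and
# the max() clamp ensures the reward never goes negative when the student
# already outperforms the teacher's own return.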
def train(self, model: torch.nn.Module,
          env_factory: EnvironmentFactory) -> (torch.nn.Module, TrainingStatistics):
    """
    Train the network.

    :param model: the network to train
    :param env_factory: environment factory
    :return: the trained network, and a list of dicts which contain the
        statistics for training
    """
    model = model.to(self.config.device)
    model.train()  # put network into training mode

    if self.config.inst_in_worker:
        # Note: this is prototype code for something like pygame...
        envs = [(copy.deepcopy(env_factory), env_cfg)
                for env_cfg in self.config.train_cfgs]
    else:
        envs = [env_factory.new_environment(env_cfg)
                for env_cfg in self.config.train_cfgs]

    num_frames_done = 0
    # Use prog_bar or print statements as we like
    prog_bar = tqdm(total=self.config.num_frames,
                    desc="{} frames out of at least {} completed".format(
                        0, self.config.num_frames))

    if self.config.algorithm == 'ppo':
        algo = torch_ac.PPOAlgo(
            envs, model,
            device=self.config.device,
            # Note: the parameter name is misleading; this is exactly the
            # number of frames that will be collected per rollout
            num_frames_per_proc=self.config.max_num_frames_rollout,
            discount=self.config.discount,
            lr=self.config.learning_rate,
            gae_lambda=self.config.gae_lambda,
            entropy_coef=self.config.entropy_coef,
            value_loss_coef=self.config.value_loss_coef,
            max_grad_norm=self.config.max_grad_norm,
            # recurrence must be set to 1 for non-recurrent models --
            # a torch_ac implementation detail
            recurrence=self.config.recurrence,
            adam_eps=self.config.adam_eps,
            clip_eps=self.config.clip_eps,
            epochs=self.config.num_epochs,
            batch_size=self.config.batch_size,
            preprocess_obss=self.config.preprocess_obss,
            reshape_reward=self.config.reshape_reward)
    else:
        raise NotImplementedError("Currently only PPO is supported")

    # Intermediate testing setup
    frames_per_test = self.config.num_frames_per_test
    test_frames = frames_per_test
    checkpoint_frames = self.config.num_frames_per_checkpoint

    ts = TrainingStatistics(train_info=self.config.train_info_dict())
    batch_num = 1
    start_time = time.time()
    early_stop = False

    while num_frames_done < self.config.num_frames and not early_stop:
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)

        num_frames_done += logs1['num_frames']
        prog_bar.update(logs1['num_frames'])
        prog_bar.set_description(
            desc="{} frames out of at least {} completed".format(
                num_frames_done, self.config.num_frames))

        ts.add_batch_stats(
            BatchTrainingStatistics(batch_num, logs2['entropy'],
                                    logs2['value'], logs2['policy_loss'],
                                    logs2['value_loss'], logs2['grad_norm']))

        if num_frames_done > test_frames and self.config.intermediate_test_cfgs:
            agg_results = self._test(model, env_factory, intermediate=True)
            model.train()
            ts.add_agent_run_stats(agg_results)
            # Note: we should try to ensure this increment is not large;
            # it might also be worth making a function/object for this
            test_frames += frames_per_test

            if self.config.early_stop is not None:
                early_stop = self.config.early_stop(
                    aggregated_test_results=agg_results,
                    logs1=logs1, logs2=logs2, optimizer_cfg=self.config)
            else:
                early_stop = self._default_early_stop(
                    aggregated_test_results=agg_results,
                    logs1=logs1, logs2=logs2, optimizer_cfg=self.config)
            if early_stop:
                ts.train_info['early_stop_frames'] = num_frames_done

        if (checkpoint_frames is not None
                and self.config.checkpoint_dir is not None
                and num_frames_done > checkpoint_frames):
            fname = 'model.pt'
            # NOTE: we don't move the model off the device to save, because
            # it is more efficient to load it directly onto CPU using
            # torch's map_location directive than to move the model from
            # device to CPU and back to device every time a checkpoint occurs
            model_state_dict = model.state_dict()
            # TODO: save the optimizer state. This requires an update to
            # `torch_ac.PPOAlgo`: state_dict() and load_state_dict() methods
            # should be implemented there. This task is reflected in the ticket:
            output_dict = dict(model_state_dict=model_state_dict,
                               num_frames=num_frames_done)
            torch.save(output_dict,
                       os.path.join(self.config.checkpoint_dir, fname))
            checkpoint_frames += self.config.num_frames_per_checkpoint

        batch_num += 1

    train_time = time.time() - start_time
    ts.add_train_time(train_time)
    return model, ts
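# A minimal sketch of reloading the checkpoint written above directly onto
# CPU, as the NOTE suggests; the checkpoint directory is an assumption.
import os
import torch

ckpt = torch.load(os.path.join("checkpoints", "model.pt"), map_location="cpu")
model_state = ckpt["model_state_dict"]   # keys match output_dict above
frames_done = ckpt["num_frames"]
# model.load_state_dict(model_state)     # then move model to the training device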
    add_agents(env)
    env.on_reset = add_agents
    env.before_reset = configure_board
    envs.append(env)

try:
    model = torch.load(MODEL_PATH)
    model.eval()
    print('Loaded Model (%s)' % MODEL_PATH)
except Exception:
    model = FourKeysAgentModel(envs[0].action_space)
    print('New Model Created')

observation_preprocessor = model.get_observation_preprocessor()
algo = torch_ac.PPOAlgo(envs, model,
                        preprocess_obss=observation_preprocessor)


def full_evaluation(model, visualize=False, evaluation_opponents=3, rounds=1):
    env = gym.make(ENVIRONMENT)
    env.seed(42)

    def eval_add_agents(coordinator):
        for i in range(evaluation_opponents):
            coordinator.add_agent(RandomAgent('Random Agent [%i]' % i))

    def eval_configure_board(coordinator):
        coordinator.shared_state_initialization = dict(side_length=15,
                                                       num_seed_walls=6,
                                                       wall_growth_factor=4)
def step(self, action):
    self.step_count += 1

    reward = 0
    done = False

    # Get the position in front of the agent
    fwd_pos = self.front_pos

    # Get the contents of the cell in front of the agent
    fwd_cell = self.grid.get(*fwd_pos)

    # Rotate left
    if action == self.actions.left:
        self.agent_dir -= 1
        if self.agent_dir < 0:
            self.agent_dir += 4

    # Rotate right
    elif action == self.actions.right:
        self.agent_dir = (self.agent_dir + 1) % 4

    # Move forward
    elif action == self.actions.forward:
        if fwd_cell is None or fwd_cell.can_overlap():
            self.agent_pos = fwd_pos
        if fwd_cell is not None and fwd_cell.type == 'goal' and not self.is_teaching:
            done = True
            reward = self._reward()
        if fwd_cell is not None and fwd_cell.type == 'lava':
            done = True

    # Pick up an object
    elif action == self.actions.pickup:
        if fwd_cell and fwd_cell.can_pickup():
            if self.carrying is None:
                self.carrying = fwd_cell
                self.carrying.cur_pos = np.array([-1, -1])
                self.grid.set(*fwd_pos, None)

    # Drop an object
    elif action == self.actions.drop:
        if not fwd_cell and self.carrying:
            self.grid.set(*fwd_pos, self.carrying)
            self.carrying.cur_pos = fwd_pos
            self.carrying = None

    # Toggle/activate an object
    elif action == self.actions.toggle:
        if fwd_cell:
            fwd_cell.toggle(self, fwd_pos)

    # Done action (only used while teaching)
    elif action == self.actions.done:
        if self.step_count >= self.min_steps and self.is_teaching:
            done = True

    else:
        assert False, "unknown action"

    if self.step_count >= self.max_steps:
        done = True

    obs = self.gen_obs()

    if done and self.is_teaching:
        # Train the student for one PPO update on environments that end
        # where the teacher stopped, and score the teacher against the
        # student's average return
        student_return_avg = []
        for _ in range(1):
            envs = []
            for i in range(self.args.procs):
                env = gym.make(self.args.env)
                env.seed(self.args.seed)
                env.is_teaching = False
                env.end_pos = self.agent_pos
                envs.append(env)

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            algo = torch_ac.PPOAlgo(
                envs, self.acmodel, device, self.args.frames_per_proc,
                self.args.discount, self.args.lr, self.args.gae_lambda,
                self.args.entropy_coef, self.args.value_loss_coef,
                self.args.max_grad_norm, self.args.recurrence,
                self.args.optim_eps, self.args.clip_eps, self.args.epochs,
                self.args.batch_size, self.preprocess_obss)

            update_start_time = time.time()
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()

            student_return_avg.append(
                self.synthesize(logs["reshaped_return_per_episode"])["mean"])

        reward = max([0, self._reward() - numpy.average(student_return_avg)])

    return obs, reward, done, {"agent_pos": self.agent_pos}
txt_logger.info("Observations preprocessor loaded") # Load model acmodel = ACModel(obs_space, envs[0].action_space) if "model_state" in status: acmodel.load_state_dict(status["model_state"]) acmodel.to(device) txt_logger.info("Model loaded\n") txt_logger.info("{}\n".format(acmodel)) # Load algo algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, 1, args.adam_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, utils.reshape_reward) if "optimizer_state" in status: algo.optimizer.load_state_dict(status["optimizer_state"]) txt_logger.info("Optimizer loaded\n") # Train model num_frames = status["num_frames"] update = status["update"] start_time = time.time() while num_frames < args.frames: # Update model parameters
acmodel = ACModel(envs[0].observation_space, envs[0].action_space, memory, False)
if "model_state" in status:
    acmodel.load_state_dict(status["model_state"])
acmodel.to(device)
txt_logger.info("Model loaded\n")
txt_logger.info("{}\n".format(acmodel))

# Load algo
if algorithm == "a2c":
    algo = torch_ac.A2CAlgo(envs, acmodel, device, 5, discount, lr,
                            gae_lambda, entropy_coef, value_loss_coef,
                            max_grad_norm, recurrence, optim_alpha,
                            optim_eps, preprocess_obss)
elif algorithm == "ppo":
    algo = torch_ac.PPOAlgo(envs, acmodel, device, 128, discount, lr,
                            gae_lambda, entropy_coef, value_loss_coef,
                            max_grad_norm, recurrence, optim_eps, clip_eps,
                            epochs, batch_size, preprocess_obss)
else:
    raise ValueError("Incorrect algorithm name: {}".format(algorithm))

if "optimizer_state" in status:
    algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

# Train model
num_frames = status["num_frames"]
update = status["update"]
start_time = time.time()

while num_frames < frames:
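    # The loop body is truncated in this excerpt; a minimal sketch of the
    # collect/update cycle that the other snippets above use at this point
    # (assumed from the repeated pattern, not taken from this file):
    exps, logs1 = algo.collect_experiences()
    logs2 = algo.update_parameters(exps)
    logs = {**logs1, **logs2}
    num_frames += logs["num_frames"]
    update += 1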