help="add a GRU to the model to handle text input") args = parser.parse_args() args.mem = args.recurrence > 1 # Set run dir date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") model_name = args.model model_dir = utils.get_model_dir(model_name) # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir) csv_file, csv_logger = utils.get_csv_logger(model_dir) tb_writer = tensorboardX.SummaryWriter(model_dir) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pretrained_model_dir = None if args.pretrained_gnn: assert(args.progression_mode == "full") default_dir = f"symbol-storage/{args.gnn}-dumb_ac_{args.ltl_sampler}_Simple-LTL-Env-v0_seed:{args.seed}_*_prog:{args.progression_mode}/train" print(default_dir) model_dirs = glob.glob(default_dir) if len(model_dirs) == 0: raise Exception("Pretraining directory not found.") elif len(model_dirs) > 1: raise Exception("More than 1 candidate pretraining directory found.") pretrained_model_dir = model_dirs[0] # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir + "/train") csv_file, csv_logger = utils.get_csv_logger(model_dir + "/train") tb_writer = tensorboardX.SummaryWriter(model_dir + "/train") utils.save_config(model_dir + "/train", args) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Set device
def main(): # Parse arguments parser = argparse.ArgumentParser() args = parser.parse_args() ## General parameters parser.add_argument("--algo", required=True, help="algorithm to use: a2c | ppo (REQUIRED)") parser.add_argument("--env", required=True, help="name of the environment to train on (REQUIRED)") parser.add_argument("--model", default=None, help="name of the model (default: {ENV}_{ALGO}_{TIME})") parser.add_argument("--seed", type=int, default=1, help="random seed (default: 1)") parser.add_argument("--log-interval", type=int, default=1, help="number of updates between two logs (default: 1)") parser.add_argument("--save-interval", type=int, default=10, help="number of updates between two saves (default: 10, 0 means no saving)") parser.add_argument("--procs", type=int, default=16, help="number of processes (default: 16)") parser.add_argument("--frames", type=int, default=10**7, help="number of frames of training (default: 1e7)") ## Parameters for main algorithm parser.add_argument("--epochs", type=int, default=4, help="number of epochs for PPO (default: 4)") parser.add_argument("--batch-size", type=int, default=256, help="batch size for PPO (default: 256)") parser.add_argument("--frames-per-proc", type=int, default=None, help="number of frames per process before update (default: 5 for A2C and 128 for PPO)") parser.add_argument("--discount", type=float, default=0.99, help="discount factor (default: 0.99)") parser.add_argument("--lr", type=float, default=0.001, help="learning rate (default: 0.001)") parser.add_argument("--gae-lambda", type=float, default=0.95, help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)") parser.add_argument("--entropy-coef", type=float, default=0.01, help="entropy term coefficient (default: 0.01)") parser.add_argument("--value-loss-coef", type=float, default=0.5, help="value loss term coefficient (default: 0.5)") parser.add_argument("--max-grad-norm", type=float, default=0.5, help="maximum norm of gradient (default: 0.5)") parser.add_argument("--optim-eps", type=float, default=1e-8, help="Adam and RMSprop optimizer epsilon (default: 1e-8)") parser.add_argument("--optim-alpha", type=float, default=0.99, help="RMSprop optimizer alpha (default: 0.99)") parser.add_argument("--clip-eps", type=float, default=0.2, help="clipping epsilon for PPO (default: 0.2)") parser.add_argument("--recurrence", type=int, default=1, help="number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory.") parser.add_argument("--text", action="store_true", default=False, help="add a GRU to the model to handle text input") parser.add_argument("--argmax", action="store_true", default=False, help="select the action with highest probability (default: False)") if len(sys.argv) > 1: args = parser.parse_args() else: args.env = 'MiniGrid-DoorKey-5x5-v0' args.env = 'MiniGrid-KeyCorridorGBLA-v0' args.algo = 'ppo' args.seed = 1234 args.model = 'KeyCorridor2' args.frames = 2e5 args.procs = 16 args.text = False args.frames_per_proc = None args.discount = 0.99 args.lr = 0.001 args.gae_lambda = 0.95 args.entropy_coef = 0.01 args.value_loss_coef = 0.5 args.max_grad_norm = 0.5 args.recurrence = 1 args.optim_eps = 1e-8 args.optim_alpha = 0.99 args.clip_eps = 0.2 args.epochs = 4 args.batch_size = 256 args.log_interval = 1 args.save_interval = 10 args.argmax = False if args.env == 'MiniGrid-KeyCorridorGBLA-v0': env_descriptor = [[0,0,0],[0,13,0],[0,0,0]] task_descriptor = TaskDescriptor(envD=env_descriptor, rmDesc=None, rmOrder=None, rmSize=4, observ=True, seed=None, time_steps=None) env = gym.make('MiniGrid-KeyCorridorGBLA-v0', taskD=task_descriptor) goal = GetGoalDescriptor(env) goal = goal.refinement[0].refinement[0].refinement[0] env = gym_minigrid.wrappers.FullyObsWrapper(env) env = gym_minigrid.wrappers.ImgObsWrapper(env) env = GoalRL.GoalEnvWrapper(env,goal=goal, verbose=0) # env = Monitor(env, 'storage/{}/{}.monitor.csv'.format(rank, goal.goalId)) # wrap the environment in the monitor object args.env = env else: pass args.mem = args.recurrence > 1 # Set run dir date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}" model_name = args.model or default_model_name model_dir = utils.get_model_dir(model_name) # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Set device # Load environments envs = [] for i in range(args.procs): if type(args.env) == str: envs.append(utils.make_env(args.env, args.seed + 10000 * i)) else: envs.append(deepcopy(args.env)) txt_logger.info("Environments loaded\n") # Load training status # Load observations preprocessor #obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space) # Load model agent = utils.Agent(env, model_dir, logger=txt_logger, argmax=args.argmax, use_memory=args.mem, use_text=args.text) # Load algo if args.algo == 'a2c': agent.init_training_algo(algo_type=args.algo, num_cpu=args.procs, frames_per_proc=args.frames_per_proc, discount=args.discount, lr=args.lr, gae_lambda=args.gae_lambda, entropy_coef=args.entropy_coef, value_loss_coef=args.value_loss_coef, max_grad_norm=args.max_grad_norm, recurrence=args.recurrence, optim_eps=args.optim_eps, optim_alpha=args.optim_alpha) # args for A2C elif args.algo == 'ppo': agent.init_training_algo(algo_type=args.algo, num_cpu=args.procs, frames_per_proc=args.frames_per_proc, discount=args.discount, lr=args.lr, gae_lambda=args.gae_lambda, entropy_coef=args.entropy_coef, value_loss_coef=args.value_loss_coef, max_grad_norm=args.max_grad_norm, recurrence=args.recurrence, optim_eps=args.optim_eps, clip_eps=args.clip_eps, # args for PPO2 epochs=args.epochs, batch_size=args.batch_size) else: raise ValueError("Incorrect algorithm name: {}".format(args.algo)) agent.learn(total_timesteps=args.frames, log_interval=args.log_interval, save_interval=args.save_interval) print('training completed!')
def main(): # Parse arguments parser = argparse.ArgumentParser() ## General parameters parser.add_argument( "--algo", required=True, help="algorithm to use: a2c | ppo | ppo_intrinsic (REQUIRED)") parser.add_argument("--env", required=True, help="name of the environment to train on (REQUIRED)") parser.add_argument( "--model", default=None, help="name of the model (default: {ENV}_{ALGO}_{TIME})") parser.add_argument("--seed", type=int, default=1, help="random seed (default: 1)") parser.add_argument("--log-interval", type=int, default=1, help="number of updates between two logs (default: 1)") parser.add_argument( "--save-interval", type=int, default=10, help= "number of updates between two saves (default: 10, 0 means no saving)") parser.add_argument("--procs", type=int, default=16, help="number of processes (default: 16)") parser.add_argument("--frames", type=int, default=10**7, help="number of frames of training (default: 1e7)") ## Parameters for main algorithm parser.add_argument("--epochs", type=int, default=4, help="number of epochs for PPO (default: 4)") parser.add_argument("--batch-size", type=int, default=256, help="batch size for PPO (default: 256)") parser.add_argument( "--frames-per-proc", type=int, default=None, help= "number of frames per process before update (default: 5 for A2C and 128 for PPO)" ) parser.add_argument("--discount", type=float, default=0.99, help="discount factor (default: 0.99)") parser.add_argument("--lr", type=float, default=0.001, help="learning rate (default: 0.001)") parser.add_argument( "--gae-lambda", type=float, default=0.95, help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)" ) parser.add_argument("--entropy-coef", type=float, default=0.01, help="entropy term coefficient (default: 0.01)") parser.add_argument("--value-loss-coef", type=float, default=0.5, help="value loss term coefficient (default: 0.5)") parser.add_argument("--max-grad-norm", type=float, default=0.5, help="maximum norm of gradient (default: 0.5)") parser.add_argument( "--optim-eps", type=float, default=1e-8, help="Adam and RMSprop optimizer epsilon (default: 1e-8)") parser.add_argument("--optim-alpha", type=float, default=0.99, help="RMSprop optimizer alpha (default: 0.99)") parser.add_argument("--clip-eps", type=float, default=0.2, help="clipping epsilon for PPO (default: 0.2)") parser.add_argument( "--recurrence", type=int, default=1, help= "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory." ) parser.add_argument("--text", action="store_true", default=False, help="add a GRU to the model to handle text input") parser.add_argument("--visualize", default=False, help="show real time CNN layer weight changes") args = parser.parse_args() args.mem = args.recurrence > 1 # Set run dir date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}" model_name = args.model or default_model_name model_dir = utils.get_model_dir(model_name) # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir) csv_file, csv_logger = utils.get_csv_logger(model_dir) tb_writer = tensorboardX.SummaryWriter(model_dir) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") txt_logger.info(f"Device: {device}\n") # Load environments envs = [] for i in range(args.procs): envs.append(utils.make_env(args.env, args.seed + 10000 * i)) txt_logger.info("Environments loaded\n") # Load training status try: status = utils.get_status(model_dir) except OSError: status = {"num_frames": 0, "update": 0} txt_logger.info("Training status loaded\n") # Load observations preprocessor obs_space, preprocess_obss = utils.get_obss_preprocessor( envs[0].observation_space) if "vocab" in status: preprocess_obss.vocab.load_vocab(status["vocab"]) txt_logger.info("Observations preprocessor loaded") # Load model acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text) if "model_state" in status: acmodel.load_state_dict(status["model_state"]) acmodel.to(device) txt_logger.info("Model loaded\n") txt_logger.info("{}\n".format(acmodel)) # Load algo if args.algo == "a2c": algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_alpha, args.optim_eps, preprocess_obss) elif args.algo == "ppo": algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss) elif args.algo == "ppo_intrinsic": algo = torch_ac.PPOAlgoIntrinsic( envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss) elif args.algo == "a2c_intrinsic": algo = torch_ac.A2CAlgoIntrinsic( envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_alpha, args.optim_eps, preprocess_obss) else: raise ValueError("Incorrect algorithm name: {}".format(args.algo)) if "optimizer_state" in status: algo.optimizer.load_state_dict(status["optimizer_state"]) txt_logger.info("Optimizer loaded\n") # Train model num_frames = status["num_frames"] update = status["update"] start_time = time.time() print_visual = args.visualize if print_visual: fig, axs = plt.subplots(1, 3) fig.suptitle('Convolution Layer Weights Normalized Difference') while num_frames < args.frames: # Store copies of s_t model params old_parameters = {} for name, param in acmodel.named_parameters(): old_parameters[name] = param.detach().numpy().copy() # Update model parameters update_start_time = time.time() exps, logs1 = algo.collect_experiences() logs2 = algo.update_parameters(exps) logs = {**logs1, **logs2} update_end_time = time.time() # Store copies of s_t+1 model params new_parameters = {} for name, param in acmodel.named_parameters(): new_parameters[name] = param.detach().numpy().copy() # Compute L2 Norm of model state differences # Print model weight change visualization for index in range(len(old_parameters.keys())): if index == 0 or index == 2 or index == 4: key = list(old_parameters.keys())[index] old_weights = old_parameters[key] new_weights = new_parameters[key] norm_diff = numpy.linalg.norm(new_weights - old_weights) diff_matrix = abs(new_weights - old_weights) diff_matrix[:, :, 0, 0] = normalize(diff_matrix[:, :, 0, 0], norm='max', axis=0) if print_visual: axs[int(index / 2)].imshow(diff_matrix[:, :, 0, 0], cmap='Greens', interpolation='nearest') # This allows the plots to update as the model trains if print_visual: plt.ion() plt.show() plt.pause(0.001) num_frames += logs["num_frames"] update += 1 # Print logs if update % args.log_interval == 0: fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize( logs["reshaped_return_per_episode"]) num_frames_per_episode = utils.synthesize( logs["num_frames_per_episode"]) header = ["update", "frames", "FPS", "duration"] data = [update, num_frames, fps, duration] header += ["rreturn_" + key for key in rreturn_per_episode.keys()] data += rreturn_per_episode.values() header += [ "num_frames_" + key for key in num_frames_per_episode.keys() ] data += num_frames_per_episode.values() header += [ "entropy", "value", "policy_loss", "value_loss", "grad_norm" ] data += [ logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"], logs["grad_norm"] ] txt_logger.info( "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}" .format(*data)) header += ["return_" + key for key in return_per_episode.keys()] data += return_per_episode.values() if status["num_frames"] == 0: csv_logger.writerow(header) csv_logger.writerow(data) csv_file.flush() for field, value in zip(header, data): tb_writer.add_scalar(field, value, num_frames) # Save status if args.save_interval > 0 and update % args.save_interval == 0: status = { "num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict() } if hasattr(preprocess_obss, "vocab"): status["vocab"] = preprocess_obss.vocab.vocab utils.save_status(status, model_dir) txt_logger.info("Status saved")
def main(raw_args=None): # Parse arguments parser = argparse.ArgumentParser() ## General parameters parser.add_argument("--algo", required=True, help="algorithm to use: a2c | ppo | ipo (REQUIRED)") parser.add_argument("--domain1", required=True, help="name of the first domain to train on (REQUIRED)") parser.add_argument( "--domain2", required=True, help="name of the second domain to train on (REQUIRED)") parser.add_argument( "--p1", required=True, type=float, help="Proportion of training environments from first domain (REQUIRED)" ) parser.add_argument("--model", required=True, help="name of the model") parser.add_argument("--seed", type=int, default=1, help="random seed (default: 1)") parser.add_argument("--log-interval", type=int, default=1, help="number of updates between two logs (default: 1)") parser.add_argument( "--save-interval", type=int, default=10, help= "number of updates between two saves (default: 10, 0 means no saving)") parser.add_argument("--procs", type=int, default=16, help="number of processes (default: 16)") parser.add_argument("--frames", type=int, default=10**7, help="number of frames of training (default: 1e7)") ## Parameters for main algorithm parser.add_argument("--epochs", type=int, default=4, help="number of epochs for PPO (default: 4)") parser.add_argument("--batch-size", type=int, default=256, help="batch size for PPO (default: 256)") parser.add_argument( "--frames-per-proc", type=int, default=None, help= "number of frames per process before update (default: 5 for A2C and 128 for PPO)" ) parser.add_argument("--discount", type=float, default=0.99, help="discount factor (default: 0.99)") parser.add_argument("--lr", type=float, default=0.001, help="learning rate (default: 0.001)") parser.add_argument( "--gae-lambda", type=float, default=0.95, help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)" ) parser.add_argument("--entropy-coef", type=float, default=0.01, help="entropy term coefficient (default: 0.01)") parser.add_argument("--value-loss-coef", type=float, default=0.5, help="value loss term coefficient (default: 0.5)") parser.add_argument("--max-grad-norm", type=float, default=0.5, help="maximum norm of gradient (default: 0.5)") parser.add_argument( "--optim-eps", type=float, default=1e-8, help="Adam and RMSprop optimizer epsilon (default: 1e-8)") parser.add_argument("--optim-alpha", type=float, default=0.99, help="RMSprop optimizer alpha (default: 0.99)") parser.add_argument("--clip-eps", type=float, default=0.2, help="clipping epsilon for PPO (default: 0.2)") parser.add_argument( "--recurrence", type=int, default=1, help= "number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory." ) parser.add_argument("--text", action="store_true", default=False, help="add a GRU to the model to handle text input") args = parser.parse_args(raw_args) args.mem = args.recurrence > 1 # Check PyTorch version if (torch.__version__ != '1.2.0'): raise ValueError( "PyTorch version must be 1.2.0 (see README). Your version is {}.". format(torch.__version__)) if args.mem: raise ValueError("Policies with memory not supported.") # Set run dir date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") default_model_name = args.model model_name = args.model or default_model_name model_dir = utils.get_model_dir(model_name) # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir) csv_file, csv_logger = utils.get_csv_logger(model_dir) tb_writer = tensorboardX.SummaryWriter(model_dir) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources torch.backends.cudnn.deterministic = True utils.seed(args.seed) # Set device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") txt_logger.info(f"Device: {device}\n") # Load environments from different domains domain1 = args.domain1 # e.g., 'MiniGrid-ColoredKeysRed-v0' domain2 = args.domain2 # e.g., 'MiniGrid-ColoredKeysYellow-v0' p1 = args.p1 # Proportion of environments from domain1 num_envs_total = args.procs # Total number of environments num_domain1 = math.ceil( p1 * num_envs_total) # Number of environments in domain1 num_domain2 = num_envs_total - num_domain1 # Number of environments in domain2 # Environments from domain1 envs1 = [] for i in range(num_domain1): envs1.append(utils.make_env(domain1, args.seed + 10000 * i)) # Environments from domain2 envs2 = [] for i in range(num_domain2): envs2.append(utils.make_env(domain2, args.seed + 10000 * i)) # All environments envs = envs1 + envs2 txt_logger.info("Environments loaded\n") # Load training status try: status = utils.get_status(model_dir) except OSError: status = {"num_frames": 0, "update": 0} txt_logger.info("Training status loaded\n") # Load observations preprocessor obs_space, preprocess_obss = utils.get_obss_preprocessor( envs[0].observation_space) if "vocab" in status: preprocess_obss.vocab.load_vocab(status["vocab"]) txt_logger.info("Observations preprocessor loaded") if args.algo == "ipo": # Load model for IPO game acmodel = ACModel_average(obs_space, envs[0].action_space, args.mem, args.text) if "model_state" in status: acmodel.load_state_dict(status["model_state"]) acmodel.to(device) txt_logger.info("Model loaded\n") txt_logger.info("{}\n".format(acmodel)) else: # Load model (for standard PPO or A2C) acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text) if "model_state" in status: acmodel.load_state_dict(status["model_state"]) acmodel.to(device) txt_logger.info("Model loaded\n") txt_logger.info("{}\n".format(acmodel)) # Load algo if args.algo == "a2c": algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_alpha, args.optim_eps, preprocess_obss) if "optimizer_state" in status: algo.optimizer.load_state_dict(status["optimizer_state"]) txt_logger.info("Optimizer loaded\n") elif args.algo == "ppo": algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss) if "optimizer_state" in status: algo.optimizer.load_state_dict(status["optimizer_state"]) txt_logger.info("Optimizer loaded\n") elif args.algo == "ipo": # One algo per domain. These have different envivonments, but shared acmodel algo1 = torch_ac.IPOAlgo( envs1, acmodel, 1, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss) algo2 = torch_ac.IPOAlgo( envs2, acmodel, 2, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss) if "optimizer_state1" in status: algo1.optimizer.load_state_dict(status["optimizer_state1"]) txt_logger.info("Optimizer 1 loaded\n") if "optimizer_state2" in status: algo2.optimizer.load_state_dict(status["optimizer_state2"]) txt_logger.info("Optimizer 2 loaded\n") else: raise ValueError("Incorrect algorithm name: {}".format(args.algo)) # Train model num_frames = status["num_frames"] update = status["update"] start_time = time.time() while num_frames < args.frames: # Update model parameters update_start_time = time.time() if args.algo == "ipo": # Standard method # Collect experiences on first domain exps1, logs_exps1 = algo1.collect_experiences() # Update params of model corresponding to first domain logs_algo1 = algo1.update_parameters(exps1) # Collect experiences on second domain exps2, logs_exps2 = algo2.collect_experiences() # Update params of model corresponding to second domain logs_algo2 = algo2.update_parameters(exps2) # Update end time update_end_time = time.time() # Combine logs logs_exps = { 'return_per_episode': logs_exps1["return_per_episode"] + logs_exps2["return_per_episode"], 'reshaped_return_per_episode': logs_exps1["reshaped_return_per_episode"] + logs_exps2["reshaped_return_per_episode"], 'num_frames_per_episode': logs_exps1["num_frames_per_episode"] + logs_exps2["num_frames_per_episode"], 'num_frames': logs_exps1["num_frames"] + logs_exps2["num_frames"] } logs_algo = { 'entropy': (num_domain1 * logs_algo1["entropy"] + num_domain2 * logs_algo2["entropy"]) / num_envs_total, 'value': (num_domain1 * logs_algo1["value"] + num_domain2 * logs_algo2["value"]) / num_envs_total, 'policy_loss': (num_domain1 * logs_algo1["policy_loss"] + num_domain2 * logs_algo2["policy_loss"]) / num_envs_total, 'value_loss': (num_domain1 * logs_algo1["value_loss"] + num_domain2 * logs_algo2["value_loss"]) / num_envs_total, 'grad_norm': (num_domain1 * logs_algo1["grad_norm"] + num_domain2 * logs_algo2["grad_norm"]) / num_envs_total } logs = {**logs_exps, **logs_algo} num_frames += logs["num_frames"] else: exps, logs1 = algo.collect_experiences() logs2 = algo.update_parameters(exps) logs = {**logs1, **logs2} update_end_time = time.time() num_frames += logs["num_frames"] update += 1 # Print logs if update % args.log_interval == 0: fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize( logs["reshaped_return_per_episode"]) num_frames_per_episode = utils.synthesize( logs["num_frames_per_episode"]) header = ["update", "frames", "FPS", "duration"] data = [update, num_frames, fps, duration] header += ["rreturn_" + key for key in rreturn_per_episode.keys()] data += rreturn_per_episode.values() header += [ "num_frames_" + key for key in num_frames_per_episode.keys() ] data += num_frames_per_episode.values() header += [ "entropy", "value", "policy_loss", "value_loss", "grad_norm" ] data += [ logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"], logs["grad_norm"] ] txt_logger.info( "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}" .format(*data)) header += ["return_" + key for key in return_per_episode.keys()] data += return_per_episode.values() # header += ["debug_last_env_reward"] # data += [logs["debug_last_env_reward"]] header += ["total_loss"] data += [ logs["policy_loss"] - args.entropy_coef * logs["entropy"] + args.value_loss_coef * logs["value_loss"] ] if status["num_frames"] == 0: csv_logger.writerow(header) csv_logger.writerow(data) csv_file.flush() for field, value in zip(header, data): tb_writer.add_scalar(field, value, num_frames) # Save status if args.save_interval > 0 and update % args.save_interval == 0: if args.algo == "ipo": status = { "num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(), "optimizer_state1": algo1.optimizer.state_dict(), "optimizer_state2": algo2.optimizer.state_dict() } else: status = { "num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict() } if hasattr(preprocess_obss, "vocab"): status["vocab"] = preprocess_obss.vocab.vocab utils.save_status(status, model_dir) txt_logger.info("Status saved")
def tuner(icm_lr, reward_weighting, normalise_rewards, args): import argparse import datetime import torch import torch_ac import tensorboardX import sys import numpy as np from model import ACModel from .a2c import A2CAlgo # from .ppo import PPOAlgo frames_to_visualise = 200 # Parse arguments args.mem = args.recurrence > 1 def make_exploration_heatmap(args, plot_title): import numpy as np import matplotlib.pyplot as plt visitation_counts = np.load( f"{args.model}_visitation_counts.npy", allow_pickle=True ) plot_title = str(np.count_nonzero(visitation_counts)) + args.model plt.imshow(np.log(visitation_counts)) plt.colorbar() plt.title(plot_title) plt.savefig(f"{plot_title}_visitation_counts.png") # Set run dir date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S") default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}" model_name = args.model or default_model_name model_dir = utils.get_model_dir(model_name) # Load loggers and Tensorboard writer txt_logger = utils.get_txt_logger(model_dir) csv_file, csv_logger = utils.get_csv_logger(model_dir) tb_writer = tensorboardX.SummaryWriter(model_dir) # Log command and all script arguments txt_logger.info("{}\n".format(" ".join(sys.argv))) txt_logger.info("{}\n".format(args)) # Set seed for all randomness sources utils.seed(args.seed) # Set device device = "cpu" # torch.device("cuda" if torch.cuda.is_available() else "cpu") txt_logger.info(f"Device: {device}\n") # Load environments envs = [] for i in range(16): an_env = utils.make_env( args.env, int(args.frames_before_reset), int(args.environment_seed) ) envs.append(an_env) txt_logger.info("Environments loaded\n") # Load training status try: status = utils.get_status(model_dir) except OSError: status = {"num_frames": 0, "update": 0} txt_logger.info("Training status loaded\n") # Load observations preprocessor obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space) if "vocab" in status: preprocess_obss.vocab.load_vocab(status["vocab"]) txt_logger.info("Observations preprocessor loaded") # Load model acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text) if "model_state" in status: acmodel.load_state_dict(status["model_state"]) acmodel.to(device) txt_logger.info("Model loaded\n") txt_logger.info("{}\n".format(acmodel)) # Load algo # adapted from impact driven RL from .models import AutoencoderWithUncertainty autoencoder = AutoencoderWithUncertainty(observation_shape=(7, 7, 3)).to(device) autoencoder_opt = torch.optim.Adam( autoencoder.parameters(), lr=icm_lr, weight_decay=0 ) if args.algo == "a2c": algo = A2CAlgo( envs, acmodel, autoencoder, autoencoder_opt, args.uncertainty, args.noisy_tv, args.curiosity, args.randomise_env, args.uncertainty_budget, args.environment_seed, reward_weighting, normalise_rewards, args.frames_before_reset, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_alpha, args.optim_eps, preprocess_obss, None, args.random_action, ) elif args.algo == "ppo": algo = PPOAlgo( envs, acmodel, autoencoder, autoencoder_opt, args.uncertainty, args.noisy_tv, args.curiosity, args.randomise_env, args.uncertainty_budget, args.environment_seed, reward_weighting, normalise_rewards, device, args.frames_per_proc, args.discount, args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef, args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps, args.epochs, args.batch_size, preprocess_obss, ) else: raise ValueError("Incorrect algorithm name: {}".format(args.algo)) if "optimizer_state" in status: algo.optimizer.load_state_dict(status["optimizer_state"]) txt_logger.info("Optimizer loaded\n") # Train model num_frames = status["num_frames"] update = status["update"] start_time = time.time() while num_frames < args.frames: # Update model parameters update_start_time = time.time() exps, logs1 = algo.collect_experiences() logs2 = algo.update_parameters(exps) logs = {**logs1, **logs2} update_end_time = time.time() num_frames += logs["num_frames"] update += 1 log_to_wandb(logs, start_time, update_start_time, update_end_time) # Print logs if update % args.log_interval == 0: fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"]) num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"]) header = ["update", "frames", "FPS", "duration"] data = [update, num_frames, fps, duration] header += ["rreturn_" + key for key in rreturn_per_episode.keys()] data += rreturn_per_episode.values() header += ["num_frames_" + key for key in num_frames_per_episode.keys()] data += num_frames_per_episode.values() header += [ "intrinsic_rewards", "uncertainties", "novel_states_visited", "entropy", "value", "policy_loss", "value_loss", "grad_norm", ] data += [ logs["intrinsic_rewards"].mean().item(), logs["uncertainties"].mean().item(), logs["novel_states_visited"].mean().item(), logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"], logs["grad_norm"], ] txt_logger.info( "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f}".format( *data ) ) # Save status if args.save_interval > 0 and update % args.save_interval == 0: status = { "num_frames": num_frames, "update": update, "model_state": acmodel.state_dict(), "optimizer_state": algo.optimizer.state_dict(), } if hasattr(preprocess_obss, "vocab"): status["vocab"] = preprocess_obss.vocab.vocab utils.save_status(status, model_dir) return