def log_to_wandb(logs, start_time, update_start_time, update_end_time):
    """Send the metrics of one training update to Weights & Biases.

    All metrics are collected into one dictionary and sent with a single
    ``wandb.log`` call. Logging each metric in its own call makes wandb
    advance its step counter once per call, which scatters the metrics of
    one update over many consecutive steps.

    Args:
        logs: Mapping holding the update's raw statistics. Keys read here:
            ``num_frames``, ``return_per_episode``,
            ``reshaped_return_per_episode``, ``num_frames_per_episode``,
            ``intrinsic_rewards``, ``uncertainties``,
            ``novel_states_visited``, ``entropy``, ``value``,
            ``policy_loss``, ``value_loss`` and ``grad_norm``.
        start_time: ``time.time()`` value taken when training started.
        update_start_time: ``time.time()`` value taken before the update.
        update_end_time: ``time.time()`` value taken after the update.
    """
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    metrics = {
        "fps": logs["num_frames"] / (update_end_time - update_start_time),
        "duration": int(time.time() - start_time),
        "return_per_episode": return_per_episode,
        "rreturn_per_episode": rreturn_per_episode,
        "number_frames_per_episode": num_frames_per_episode,
    }

    # Flattened per-statistic entries; both summaries come from
    # utils.synthesize and are indexed with the same keys.
    for a_key in rreturn_per_episode:
        metrics["rreturn_" + a_key] = rreturn_per_episode[a_key]
        metrics["num_frames_" + a_key] = num_frames_per_episode[a_key]

    # These three entries expose .mean()/.max() and .item()
    # (presumably torch tensors -- confirm against the algorithm's logs).
    metrics["intrinsic_rewards"] = logs["intrinsic_rewards"].mean().item()
    metrics["uncertainties"] = logs["uncertainties"].mean().item()
    metrics["novel_states_visited"] = logs["novel_states_visited"].max().item()

    for name in ("entropy", "value", "policy_loss", "value_loss", "grad_norm"):
        metrics[name] = logs[name]

    wandb.log(metrics)
def main(raw_args=None):
    """Train an actor-critic model on a mixture of two environment domains.

    Parses the command line, builds ``--procs`` environments split between
    ``--domain1`` and ``--domain2`` according to ``--p1``, restores any saved
    training status from the model directory, and runs the training loop.
    Metrics are written to the text log, the CSV log and Tensorboard every
    ``--log-interval`` updates; the status is saved every ``--save-interval``
    updates.

    Args:
        raw_args: Optional list of argument strings passed to
            ``ArgumentParser.parse_args``; ``None`` means ``sys.argv[1:]``.

    Raises:
        ValueError: If the PyTorch version is not 1.2.0, if a recurrent
            policy is requested (``--recurrence`` > 1), if ``--p1`` is
            outside [0, 1], or if ``--algo`` is not a2c, ppo or ipo.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument("--algo", required=True,
                        help="algorithm to use: a2c | ppo | ipo (REQUIRED)")
    parser.add_argument("--domain1", required=True,
                        help="name of the first domain to train on (REQUIRED)")
    parser.add_argument("--domain2", required=True,
                        help="name of the second domain to train on (REQUIRED)")
    parser.add_argument("--p1", required=True, type=float,
                        help="Proportion of training environments from first domain (REQUIRED)")
    parser.add_argument("--model", required=True,
                        help="name of the model")
    parser.add_argument("--seed", type=int, default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval", type=int, default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument("--save-interval", type=int, default=10,
                        help="number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs", type=int, default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames", type=int, default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size", type=int, default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument("--frames-per-proc", type=int, default=None,
                        help="number of frames per process before update (default: 5 for A2C and 128 for PPO)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr", type=float, default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument("--gae-lambda", type=float, default=0.95,
                        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)")
    parser.add_argument("--entropy-coef", type=float, default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--optim-eps", type=float, default=1e-8,
                        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha", type=float, default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--recurrence", type=int, default=1,
                        help="number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory.")
    parser.add_argument("--text", action="store_true", default=False,
                        help="add a GRU to the model to handle text input")

    args = parser.parse_args(raw_args)
    args.mem = args.recurrence > 1

    # Check PyTorch version
    if torch.__version__ != '1.2.0':
        raise ValueError(
            "PyTorch version must be 1.2.0 (see README). Your version is {}.".
            format(torch.__version__))

    if args.mem:
        raise ValueError("Policies with memory not supported.")

    # A proportion outside [0, 1] would give a negative environment count for
    # one domain and negative weights when the two domains' logs are averaged.
    if not 0.0 <= args.p1 <= 1.0:
        raise ValueError(
            "--p1 must be between 0 and 1, got {}".format(args.p1))

    # Set run dir
    model_dir = utils.get_model_dir(args.model)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    torch.backends.cudnn.deterministic = True
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments from different domains
    domain1 = args.domain1  # e.g., 'MiniGrid-ColoredKeysRed-v0'
    domain2 = args.domain2  # e.g., 'MiniGrid-ColoredKeysYellow-v0'
    p1 = args.p1  # Proportion of environments from domain1
    num_envs_total = args.procs  # Total number of environments
    num_domain1 = math.ceil(p1 * num_envs_total)  # Environments in domain1
    num_domain2 = num_envs_total - num_domain1  # Environments in domain2

    # Both domains use the same per-index seed offsets, as before.
    envs1 = [utils.make_env(domain1, args.seed + 10000 * i)
             for i in range(num_domain1)]
    envs2 = [utils.make_env(domain2, args.seed + 10000 * i)
             for i in range(num_domain2)]

    # All environments
    envs = envs1 + envs2
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model: IPO uses ACModel_average, standard PPO / A2C use ACModel
    model_class = ACModel_average if args.algo == "ipo" else ACModel
    acmodel = model_class(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo
    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
        if "optimizer_state" in status:
            algo.optimizer.load_state_dict(status["optimizer_state"])
            txt_logger.info("Optimizer loaded\n")
    elif args.algo == "ipo":
        # One algo per domain. These have different environments, but a
        # shared acmodel.
        algo1 = torch_ac.IPOAlgo(
            envs1, acmodel, 1, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps,
            args.clip_eps, args.epochs, args.batch_size, preprocess_obss)
        algo2 = torch_ac.IPOAlgo(
            envs2, acmodel, 2, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps,
            args.clip_eps, args.epochs, args.batch_size, preprocess_obss)

        if "optimizer_state1" in status:
            algo1.optimizer.load_state_dict(status["optimizer_state1"])
            txt_logger.info("Optimizer 1 loaded\n")
        if "optimizer_state2" in status:
            algo2.optimizer.load_state_dict(status["optimizer_state2"])
            txt_logger.info("Optimizer 2 loaded\n")
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # The CSV header is written once, and only for a run that starts from
    # scratch. `status` is reassigned only when a checkpoint is saved, so
    # testing status["num_frames"] on every log would repeat the header until
    # the first save.
    csv_header_pending = status["num_frames"] == 0

    while num_frames < args.frames:
        # Update model parameters
        update_start_time = time.time()

        if args.algo == "ipo":
            # Collect experiences on the first domain, then update the shared
            # model from them
            exps1, logs_exps1 = algo1.collect_experiences()
            logs_algo1 = algo1.update_parameters(exps1)

            # Same for the second domain
            exps2, logs_exps2 = algo2.collect_experiences()
            logs_algo2 = algo2.update_parameters(exps2)

            update_end_time = time.time()

            # Combine logs: per-episode entries and frame counts are
            # concatenated / summed
            logs_exps = {
                key: logs_exps1[key] + logs_exps2[key]
                for key in ("return_per_episode",
                            "reshaped_return_per_episode",
                            "num_frames_per_episode",
                            "num_frames")
            }
            # Optimisation statistics are averaged, weighted by the number of
            # environments in each domain
            logs_algo = {
                key: (num_domain1 * logs_algo1[key]
                      + num_domain2 * logs_algo2[key]) / num_envs_total
                for key in ("entropy", "value", "policy_loss", "value_loss",
                            "grad_norm")
            }
            logs = {**logs_exps, **logs_algo}
        else:
            exps, logs1 = algo.collect_experiences()
            logs2 = algo.update_parameters(exps)
            logs = {**logs1, **logs2}
            update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            # The text line consumes exactly the fields gathered so far; the
            # fields appended below only go to the CSV file and Tensorboard.
            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            header += ["total_loss"]
            data += [
                logs["policy_loss"] - args.entropy_coef * logs["entropy"] +
                args.value_loss_coef * logs["value_loss"]
            ]

            if csv_header_pending:
                csv_logger.writerow(header)
                csv_header_pending = False
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
            }
            if args.algo == "ipo":
                status["optimizer_state1"] = algo1.optimizer.state_dict()
                status["optimizer_state2"] = algo2.optimizer.state_dict()
            else:
                status["optimizer_state"] = algo.optimizer.state_dict()
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")
def run(full_args: Namespace, return_models: bool = False):
    """Build environments, model and agent from a config and train the agent.

    Args:
        full_args: Experiment configuration. Sub-namespaces read here are
            ``main``, ``agent``, ``model`` and ``env_cfg``, plus ``run_id``,
            ``out_dir`` and the optional ``extra_logs`` / ``main_r_key``.
        return_models: When true, return right after the agent is built
            instead of training.

    Returns:
        ``(algo, model, envs, saver)`` when ``return_models`` is true,
        otherwise ``None`` once ``args.frames`` frames have been processed.

    Raises:
        SystemExit: When the mean of the last ``max_eprews_window`` logged
            rewards exceeds ``args.max_eprews``.
    """
    if sys.argv[0].startswith("train"):
        import os
        full_args.out_dir = os.path.dirname(sys.argv[1])

    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    extra_logs = getattr(full_args, "extra_logs", None)
    main_r_key = getattr(full_args, "main_r_key", None)

    # Seed 0 means "derive the seed from the run id"
    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews
    max_eprews_window = getattr(args, "max_eprews_window", 1)

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ==========================================================================
    # @ torc_rl repo original

    # Define logger, CSV writer and Tensorboard writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments
    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ==========================================================================
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ==========================================================================
    # Generate environments
    envs = []

    # Get env wrappers - must be a list of elements
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:
        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(gym_wrappers, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            # Wrappers are applied last-to-first, so the first listed wrapper
            # ends up outermost.
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    no_actions = getattr(full_args.env_cfg, "no_actions", 6)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                    n_actions=no_actions)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps
            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    eval_episodes = getattr(full_args.env_cfg, "eval_episodes", 0)
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                         n_actions=no_actions)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    permute = getattr(full_args.env_cfg, "permute", False)
    obss_preprocessor = getattr(full_args.env_cfg, "obss_preprocessor", None)
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env, first_env.observation_space, model_dir,
        max_image_value=max_image_value, normalize=normalize_img,
        permute=permute, type=obss_preprocessor)

    first_obs = first_env.reset()
    if "state" in first_obs:
        full_state_size = first_obs["state"].shape
        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "full_state_size",
                   full_state_size)

    if "position" in first_obs:
        position_size = first_obs["position"].shape
        # Add full size shape
        add_to_cfg(full_args, MAIN_CFG_ARGS, "position_size", position_size)

        # Add the width and height of environment for position estimation
        # NOTE(review): nesting under the "position" branch was inferred from
        # the comment above; confirm against the original layout.
        model_args.width = first_env.unwrapped.width
        model_args.height = first_env.unwrapped.height

    # ==========================================================================
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir, save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ==========================================================================
    # Load Model
    if model is None:
        model = get_model(model_args, obs_space, first_env.action_space,
                          use_memory=model_args.mem)
        logger.info(f"Model [{model_args.name}] successfully created\n")

        # Print Model info
        logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ==========================================================================
    # Load Agent
    algo = get_agent(full_args.agent, envs, model, agent_data,
                     preprocess_obss=preprocess_obss, reshape_reward=None,
                     eval_envs=eval_envs, eval_episodes=eval_episodes)

    has_evaluator = (hasattr(algo, "evaluate")
                     and full_args.env_cfg.no_eval_envs > 0)

    if return_models:
        return algo, model, envs, saver

    # ==========================================================================
    # Train model
    prev_rewards = []
    crt_eprew = 0
    # other_data stays None when no training data could be loaded; a bare
    # membership test on None would raise TypeError on every fresh run.
    if other_data is not None and "eprew" in other_data:
        crt_eprew = other_data["eprew"]

    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]
    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if update % args.eval_interval == 0 and has_evaluator:
            eval_logs = algo.evaluate(eval_key=main_r_key)
            logs.update(eval_logs)

        # FPS is measured from the end of the previous update (evaluation
        # included) to the end of this one.
        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # Add log fields that are not in the standard log format
            # (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # Print the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # `status` is replaced right below, so the header is written on
            # the first logged update of a fresh run only.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            if main_r_key is None:
                crt_eprew = list(rreturn_per_episode.values())[0]
                prev_rewards.append(crt_eprew)
            else:
                crt_eprew = logs[main_r_key]
                prev_rewards.append(logs[main_r_key])

        # -- Save vocabulary and model
        if args.save_interval > 0 and update % args.save_interval == 0:
            preprocess_obss.vocab.save()

            saver.save_training_data(model, algo.get_save_data(), crt_eprew)

            logger.info("Model successfully saved")

            # Persist the counters of this very update. `status` is only
            # refreshed on logged updates and may lag behind the model that
            # was just saved.
            utils.save_status({"num_frames": num_frames, "update": update},
                              model_dir)

        check_rew = np.mean(prev_rewards[-max_eprews_window:])
        if len(prev_rewards) > max_eprews_window and check_rew > max_eprews:
            print(
                f"Reached mean return {max_eprews} for a window of {max_eprews_window} steps"
            )
            # sys.exit() rather than exit(): the latter is a helper installed
            # by the `site` module and is not guaranteed to exist.
            sys.exit()
# Training loop: run updates until the frame budget `args.frames` is reached.
# `num_frames`, `update`, `start_time`, `status`, `algo`, `txt_logger` and
# `csv_logger` are defined by code outside this excerpt.
while num_frames < args.frames:
    # Update model parameters
    update_start_time = time.time()
    logs = algo.collect_experiences()
    update_end_time = time.time()

    num_frames += logs["num_frames"]
    update += 1

    # Print logs
    if update % args.log_interval == 0:
        # Throughput of this update only; duration is wall time since start
        fps = logs["num_frames"] / (update_end_time - update_start_time)
        duration = int(time.time() - start_time)
        return_per_episode = utils.synthesize(logs["rewards"])

        header = ["update", "frames", "FPS", "duration"]
        data = [update, num_frames, fps, duration]
        header += ["return_" + key for key in return_per_episode.keys()]
        data += return_per_episode.values()
        header += ["policy_loss"]
        data += [np.mean(logs["loss"])]

        # The format string has nine placeholders: the four counters above,
        # four summary values from utils.synthesize, and the mean loss.
        txt_logger.info(
            "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | pL {:.3f}"
            .format(*data))

        # NOTE(review): `status` is not reassigned anywhere in this excerpt,
        # so on a fresh run this condition holds on every logged update and
        # the header row is repeated -- confirm whether code outside this
        # view updates `status`.
        if status["num_frames"] == 0:
            csv_logger.writerow(header)
        csv_logger.writerow(data)
def train_i2a_model(environment_class,
                    environment_model_name,
                    algorithm,
                    imagination_steps,
                    seed=1,
                    procs=16,
                    frames=10 ** 7,
                    log_interval=1,
                    save_interval=10,
                    frames_per_proc=None,
                    discount=0.99,
                    lr=7e-4,
                    gae_lambda=0.95,
                    entropy_coef=0.01,
                    value_loss_coef=0.5,
                    max_grad_norm=0.5,
                    recurrence=1,
                    optim_eps=1e-5,
                    optim_alpha=0.99,
                    clip_eps=0.2,
                    epochs=4,
                    batch_size=256,
                    no_instr=False,
                    no_mem=False,
                    note=None,
                    tensorboard=True):
    """Train an I2A agent on top of a previously saved environment model.

    Only ``environment_class``, ``environment_model_name``, ``algorithm``,
    ``imagination_steps``, ``seed``, ``frames``, ``log_interval``,
    ``save_interval``, ``note`` and ``tensorboard`` are read by this
    function; every argument is still written to the log on start-up.

    Args:
        environment_class: Environment to train on (REQUIRED).
        environment_model_name: Name of the saved environment model to load.
        algorithm: Training algorithm object; must provide
            ``load_acmodel``, ``update_parameters`` and ``acmodel``.
        imagination_steps: Passed to ``I2AModel`` and used in the model name.
        seed: Random seed (default: 1).
        procs: Number of processes (default: 16).
        frames: Number of frames of training (default: 10 ** 7).
        log_interval: Number of updates between two logs (default: 1).
        save_interval: Number of updates between two model saves
            (default: 10, 0 means no saving).
        frames_per_proc: Number of frames per process before update.
        discount: Discount factor (default: 0.99).
        lr: Learning rate for optimizers (default: 7e-4).
        gae_lambda: Lambda coefficient in GAE formula (default: 0.95).
        entropy_coef: Entropy term coefficient (default: 0.01).
        value_loss_coef: Value loss term coefficient (default: 0.5).
        max_grad_norm: Maximum norm of gradient (default: 0.5).
        recurrence: Number of steps the gradient is propagated back in time
            (default: 1).
        optim_eps: Adam and RMSprop optimizer epsilon (default: 1e-5).
        optim_alpha: RMSprop optimizer alpha (default: 0.99).
        clip_eps: Clipping epsilon for PPO (default: 0.2).
        epochs: Number of epochs for PPO (default: 4).
        batch_size: Batch size for PPO (default: 256).
        no_instr: Don't use instructions in the model.
        no_mem: Don't use memory in the model.
        note: Optional name suffix inserted into the model name.
        tensorboard: Whether to also write scalars with tensorboardX.
    """
    # Must stay the first statement so that only the arguments are captured.
    saved_arguments = locals()

    # Unique model name / directory for this run
    date_suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    note = note + "_" if note else ""
    model_name = "I2A-{}_{}{}_s{}_{}".format(
        imagination_steps, note, environment_name(environment_class), seed,
        date_suffix)
    model_dir = utils.get_model_dir(model_name)

    # Text logger, CSV writer and (optional) Tensorboard writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    if tensorboard:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Record every argument the function was called with
    logger.info("{}\n".format(saved_arguments))

    # Seed all randomness sources
    utils.seed(seed)

    # Resume counters from a saved status when one exists
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]

    # Build the I2A actor-critic around the loaded environment model
    environment_model = utils.load_model(
        utils.get_model_dir(environment_model_name))
    i2a_model = I2AModel(environment_class, environment_model,
                         imagination_steps)
    algorithm.load_acmodel(i2a_model)

    logger.info("Using environment model: {}\n".format(environment_model_name))
    logger.info("{}\n".format(environment_model))
    logger.info("Agent architecture:\n")
    logger.info("{}\n".format(i2a_model))

    scalar_keys = ["entropy", "value", "policy_loss", "value_loss",
                   "grad_norm", "distillation_loss"]

    while num_frames < frames:
        # One parameter update, timed for the FPS figure
        update_start_time = time.time()
        logs = algorithm.update_parameters()
        update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        if update % log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]

            header.extend("rreturn_" + key for key in rreturn_per_episode.keys())
            data.extend(rreturn_per_episode.values())

            header.extend(
                "num_frames_" + key for key in num_frames_per_episode.keys())
            data.extend(num_frames_per_episode.values())

            header.extend(scalar_keys)
            data.extend(logs[key] for key in scalar_keys)

            # The text line uses exactly the fields collected up to here
            logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:x̄σmM {:.2f} {:.2f} {:.2f} {:.2f} | F:x̄σmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f} | dL {:.3f}"
                .format(*data))

            # Unshaped returns go to the CSV file / Tensorboard only
            header.extend("return_" + key for key in return_per_episode.keys())
            data.extend(return_per_episode.values())

            # Header only on the first logged update of a fresh run
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if tensorboard:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}
            utils.save_status(status, model_dir)

        # Save vocabulary and model
        if save_interval > 0 and update % save_interval == 0:
            utils.save_model(algorithm.acmodel, model_dir)
            logger.info("Model successfully saved")
def run_eval():
    """Evaluate the saved agent for ``args.episodes`` episodes.

    Reads the module-level ``args`` and ``device``. A single environment is
    created with teaching disabled and its goal set to ``args.eval_goal``.
    A one-line summary is printed when evaluation ends.

    Returns:
        The result of ``utils.synthesize`` over the per-episode returns.
    """
    # Build the (single) evaluation environment
    eval_envs = []
    for idx in range(1):
        single_env = utils.make_env(args.env, args.seed + 10000 * idx)
        single_env.is_teaching = False
        single_env.end_pos = args.eval_goal
        eval_envs.append(single_env)
    env = ParallelEnv(eval_envs)

    # Load agent
    agent = utils.Agent(env.observation_space, env.action_space,
                        utils.get_model_dir(args.model), device, args.argmax,
                        args.procs)

    # Initialize logs
    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent
    start_time = time.time()
    obss = env.reset()

    finished_episodes = 0
    running_return = torch.zeros(args.procs, device=device)
    running_frames = torch.zeros(args.procs, device=device)
    positions = []

    while finished_episodes < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, infos = env.step(actions)
        for info in infos:
            positions.append(info["agent_pos"])
        agent.analyze_feedbacks(rewards, dones)

        running_return += torch.tensor(rewards, device=device,
                                       dtype=torch.float)
        running_frames += torch.ones(args.procs, device=device)

        for env_idx, done in enumerate(dones):
            if not done:
                continue
            finished_episodes += 1
            logs["return_per_episode"].append(running_return[env_idx].item())
            logs["num_frames_per_episode"].append(
                running_frames[env_idx].item())

        # Reset the accumulators of environments whose episode just ended
        keep = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        running_return *= keep
        running_frames *= keep

    end_time = time.time()

    # Print logs
    elapsed = end_time - start_time
    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames / elapsed
    duration = int(elapsed)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    summary = "Eval: F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}".format(
        num_frames, fps, duration,
        *return_per_episode.values(),
        *num_frames_per_episode.values())
    print(summary)

    return return_per_episode
log_episode_num_frames[i].item()) logs["events_per_episode"].append(log_events[i]) log_events[i] = [] mask = 1 - torch.tensor(dones, device=device, dtype=torch.float) log_episode_return *= mask log_episode_num_frames *= mask end_time = time.time() # Print logs num_frames = sum(logs["num_frames_per_episode"]) fps = num_frames / (end_time - start_time) duration = int(end_time - start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"]) print( "F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}" .format(num_frames, fps, duration, *return_per_episode.values(), *num_frames_per_episode.values())) # Print worst episodes n = args.worst_episodes_to_show if n > 0: print("\n{} worst episodes:".format(n)) indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k])
update_start_time = time.time() exps, logs1 = algo.collect_experiences() logs2 = algo.update_parameters(exps) logs = {**logs1, **logs2} update_end_time = time.time() num_frames += logs["num_frames"] update += 1 # Print logs if update % args.log_interval == 0: fps = logs["num_frames"]/(update_end_time - update_start_time) duration = int(time.time() - start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"]) success_per_episode = utils.synthesize([1 if r > 0 else 0 for r in logs["return_per_episode"]]) num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"]) header = ["update", "frames", "FPS", "duration", "goals"] data = [update, num_frames, fps, duration, len(list(algo.goals.keys()))] header += ["rreturn_" + key for key in rreturn_per_episode.keys()] data += rreturn_per_episode.values() header += ["num_frames_" + key for key in num_frames_per_episode.keys()] data += num_frames_per_episode.values() header += ["entropy", "value", "success_rate", "policy_loss", "value_loss", "grad_norm"] data += [logs["entropy"], logs["value"], success_per_episode["mean"], logs["policy_loss"], logs["value_loss"], logs["grad_norm"]] txt_logger.info( "U {} | F {:06} | FPS {:04.0f} | D {} | G {} | rR:uomM {:.2f} {:.2f} {:.2f} {:.2f} | F:uomM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | S {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
def run(full_args: Namespace) -> None:
    """Train an agent described by a nested configuration namespace.

    The function reads the ``main``, ``agent``, ``model`` and ``env_cfg``
    sections of *full_args*, builds the training (and optional evaluation)
    environments, restores any status / training data found in the model
    directory, and then calls ``algo.update_parameters()`` until
    ``args.frames`` frames have been processed. Progress is written to the
    text logger, to a CSV file and, when ``args.tb`` is set, to TensorBoard.

    Args:
        full_args: Configuration namespace. Besides the four sections above,
            ``run_id`` and ``out_dir`` are read, and ``extra_logs`` is used
            when present.

    Raises:
        SystemExit: When ``args.max_eprews`` is non-zero and the tracked
            episode return exceeds it.
    """
    args = full_args.main
    agent_args = full_args.agent
    model_args = full_args.model
    env_args = full_args.env_cfg
    extra_logs = getattr(full_args, "extra_logs", None)

    # A seed of 0 means "derive the seed from the run id".
    if args.seed == 0:
        args.seed = full_args.run_id + 1
    max_eprews = args.max_eprews

    post_process_args(agent_args)
    post_process_args(model_args)

    model_dir = getattr(args, "model_dir", full_args.out_dir)
    print(model_dir)

    # ------------------------------------------------------------------
    # Logger, CSV writer and (optional) TensorBoard writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)
    tb_writer = None
    if args.tb:
        from tensorboardX import SummaryWriter
        tb_writer = SummaryWriter(model_dir)

    # Log command and all script arguments
    logger.info("{}\n".format(" ".join(sys.argv)))
    logger.info("{}\n".format(args))

    # ------------------------------------------------------------------
    # Set seed for all randomness sources
    utils.seed(args.seed)

    # ------------------------------------------------------------------
    # Build the environment wrapper: identity when none is configured,
    # otherwise the configured wrappers applied from last to first.
    envs = []
    wrapper_method = getattr(full_args.env_cfg, "wrapper", None)
    if wrapper_method is None:
        def idem(x):
            return x

        env_wrapper = idem
    else:
        env_wrappers = [getattr(environment, w_p) for w_p in wrapper_method]

        def env_wrapp(w_env):
            for wrapper in env_wrappers[::-1]:
                w_env = wrapper(w_env)
            return w_env

        env_wrapper = env_wrapp

    actual_procs = getattr(args, "actual_procs", None)
    master_make_envs = getattr(full_args.env_cfg, "master_make_envs", False)

    if actual_procs:
        # Split envs in chunks
        no_envs = args.procs
        envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                    master_make=master_make_envs)
        first_env = envs[0][0]
        print(
            f"NO of envs / proc: {chunk_size}; No of processes {len(envs[1:])} + Master"
        )
    else:
        for i in range(args.procs):
            env = env_wrapper(gym.make(args.env))
            env.max_steps = full_args.env_cfg.max_episode_steps
            env.no_stacked_frames = full_args.env_cfg.no_stacked_frames
            env.seed(args.seed + 10000 * i)
            envs.append(env)
        first_env = envs[0]

    # Generate evaluation envs
    eval_envs = []
    if full_args.env_cfg.no_eval_envs > 0:
        no_envs = full_args.env_cfg.no_eval_envs
        eval_envs, chunk_size = get_envs(full_args, env_wrapper, no_envs,
                                         master_make=master_make_envs)

    # Define obss preprocessor
    max_image_value = full_args.env_cfg.max_image_value
    normalize_img = full_args.env_cfg.normalize
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        args.env,
        first_env.observation_space,
        model_dir,
        max_image_value=max_image_value,
        normalize=normalize_img)

    # ------------------------------------------------------------------
    # Load training status
    try:
        status = utils.load_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}

    saver = utils.SaveData(model_dir,
                           save_best=args.save_best,
                           save_all=args.save_all)
    model, agent_data, other_data = None, dict(), None
    try:
        # Continue from last point
        model, agent_data, other_data = saver.load_training_data(best=False)
        logger.info("Training data exists & loaded successfully\n")
    except OSError:
        logger.info("Could not load training data\n")

    # ------------------------------------------------------------------
    # Load Model
    if model is None:
        model = get_model(model_args,
                          obs_space,
                          first_env.action_space,
                          use_memory=model_args.use_memory,
                          no_stacked_frames=env_args.no_stacked_frames)
        logger.info(f"Model [{model_args.name}] successfully created\n")

    # Print Model info
    logger.info("{}\n".format(model))

    if torch.cuda.is_available():
        model.cuda()
    logger.info("CUDA available: {}\n".format(torch.cuda.is_available()))

    # ------------------------------------------------------------------
    # Load Agent
    algo = get_agent(full_args.agent,
                     envs,
                     model,
                     agent_data,
                     preprocess_obss=preprocess_obss,
                     reshape_reward=None,
                     eval_envs=eval_envs)

    has_evaluator = hasattr(algo, "evaluate") and full_args.env_cfg.no_eval_envs > 0

    # ------------------------------------------------------------------
    # Train model
    crt_eprew = 0
    # other_data stays None when no training data could be loaded, so it
    # must be checked before the membership test.
    if other_data is not None and "eprew" in other_data:
        crt_eprew = other_data["eprew"]
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]

    update_start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        logs = algo.update_parameters()

        num_frames += logs["num_frames"]
        update += 1

        if has_evaluator:
            if update % args.eval_interval == 0:
                algo.evaluate()

        # FPS is measured between consecutive loop iterations, so it also
        # includes evaluation and logging time.
        prev_start_time = update_start_time
        update_start_time = time.time()

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_start_time - prev_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += ["entropy", "value", "policy_loss", "value_loss"]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"]
            ]
            header += ["grad_norm"]
            data += [logs["grad_norm"]]

            # add log fields that are not in the standard log format
            # (for example value_int)
            extra_fields = extra_log_fields(header, list(logs.keys()))
            header.extend(extra_fields)
            data += [logs[field] for field in extra_fields]

            # print to stdout the standard log fields + fields required in config
            keys_format, printable_data = print_keys(header, data, extra_logs)
            logger.info(keys_format.format(*printable_data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            # status is refreshed right below, so the header is only written
            # before the first row of a fresh run.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            if args.tb:
                for field, value in zip(header, data):
                    tb_writer.add_scalar(field, value, num_frames)

            status = {"num_frames": num_frames, "update": update}

            # First synthesized statistic of the reshaped return
            # (presumably the mean -- depends on utils.synthesize ordering).
            crt_eprew = list(rreturn_per_episode.values())[0]

        # -- Save vocabulary and model
        if args.save_interval > 0 and update % args.save_interval == 0:
            saver.save_training_data(model, algo.get_save_data(), crt_eprew)
            logger.info("Model successfully saved")
            utils.save_status(status, model_dir)

        # Early stop once the configured return threshold (0 = disabled)
        # has been exceeded.
        if max_eprews != 0 and crt_eprew > max_eprews:
            print("Reached max return {}".format(max_eprews))
            sys.exit()
update += 1 # Print logs if update % args.log_interval == 0: fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - start_time) txt_logger.info("U {} | F {} | FPS {:04.0f} | D {}".format( update, num_frames, fps, duration)) header = [] data = [] if args.env is not None: header += ["perf"] data += [utils.synthesize(logs["return_per_episode"])["mean"]] elif args.curriculum is not None: for i, env_id in enumerate(env_ids): header += ["proba/{}".format(env_id)] data += [penv_head.dist[i]] header += ["perf/{}".format(env_id)] data += [None] if i in penv_head.synthesized_returns.keys(): data[-1] = penv_head.synthesized_returns[i] if args.acp in ["LP", "MR"]: header += ["lp/{}".format(env_id)] data += [compute_dist.compute_att.lps[i]] header += ["attention/{}".format(env_id)] data += [compute_dist.compute_att.atts[i]] if args.acp in ["MR"]: header += ["max_perf/{}".format(env_id)]
def start(model, seed, episodes, size):
    """Evaluate a stored agent on DoorKey for each wall position.

    For every wall id in ``range(2, size - 2)`` ten environments are created
    with that wall id, the agent stored under *model* is run until at least
    *episodes* episodes have finished, and the synthesized return / episode
    length statistics are printed and recorded.

    Args:
        model: Name of the stored model (passed to ``utils.get_save_dir``).
        seed: Seed handed to ``utils.seed``.
        episodes: Number of finished episodes to collect per wall position.
        size: Grid size; selects ``MiniGrid-DoorKey-<size>x<size>-v0``.

    Returns:
        A ``(size, 8)`` array. Row ``w`` holds mean/std/min/max of the return
        followed by mean/std/min/max of the episode length for wall ``w``;
        rows for wall ids that were not evaluated stay zero.
    """
    size_tag = str(size)
    env_name = "MiniGrid-DoorKey-" + size_tag + "x" + size_tag + "-v0"
    utils.seed(seed)

    procs = 10
    argmax = False
    stat_names = ("mean", "std", "min", "max")

    all_data = np.zeros(shape=(size, 8))
    print("Evaluating storage/" + model)

    def build_env(wall_id):
        # One environment pinned to the requested wall position.
        single = gym.make(env_name)
        single.setWallID(wall_id)
        return single

    for wall_id in range(2, size - 2):
        # Generate environment
        env = ParallelEnv([build_env(wall_id) for _ in range(procs)])

        # Define agent
        agent = utils.Agent(utils.get_save_dir(model), env.observation_space,
                            argmax, procs)
        dev = agent.device

        # Initialize logs
        episode_returns = []
        episode_lengths = []
        logs = {
            "num_frames_per_episode": episode_lengths,
            "return_per_episode": episode_returns,
        }

        # Run the agent
        start_time = time.time()
        obss = env.reset()

        finished = 0
        running_return = torch.zeros(procs, device=dev)
        running_frames = torch.zeros(procs, device=dev)

        while finished < episodes:
            actions = agent.get_actions(obss)
            obss, rewards, dones, _ = env.step(actions)
            agent.analyze_feedbacks(rewards, dones)

            running_return += torch.tensor(rewards, device=dev,
                                           dtype=torch.float)
            running_frames += torch.ones(procs, device=dev)

            for slot, is_done in enumerate(dones):
                if not is_done:
                    continue
                finished += 1
                episode_returns.append(running_return[slot].item())
                episode_lengths.append(running_frames[slot].item())

            # Reset the accumulators of the environments that just finished.
            keep = 1 - torch.tensor(dones, device=dev, dtype=torch.float)
            running_return *= keep
            running_frames *= keep

        end_time = time.time()

        # Print logs
        elapsed = end_time - start_time
        num_frames = sum(logs["num_frames_per_episode"])
        fps = num_frames / elapsed
        duration = int(elapsed)
        return_per_episode = utils.synthesize(logs["return_per_episode"])
        num_frames_per_episode = utils.synthesize(
            logs["num_frames_per_episode"])

        print(
            "Wall {:3d} | F {:6.0f} | FPS {:4.0f} | D {:3d} | R:x̄σmM {:.2f} {:.2f} {:.2f} {:.2f} | F:x̄σmM {:6.1f} {:6.1f} {:6.1f} {:6.1f}"
            .format(wall_id, num_frames, fps, duration,
                    *return_per_episode.values(),
                    *num_frames_per_episode.values()))

        # Columns 0-3: return statistics, columns 4-7: episode-length statistics.
        for column, stat in enumerate(stat_names):
            all_data[wall_id, column] = return_per_episode[stat]
        for column, stat in enumerate(stat_names, start=4):
            all_data[wall_id, column] = num_frames_per_episode[stat]

    return all_data
args.batch_size, preprocess_obss) else: raise ValueError("Incorrect algorithm name: {}".format(args.algo)) best_model = base_model.state_dict() while num_frames < args.frames: # Update model parameters update_start_time = time.time() logs = algo.update_parameters() update_end_time = time.time() num_frames += logs["num_frames"] update += 1 if utils.synthesize(logs["return_per_episode"])['mean'] > best_val: best_model = base_model.state_dict() # Print logs if update % args.log_interval == 0: fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - total_start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize( logs["reshaped_return_per_episode"]) num_frames_per_episode = utils.synthesize( logs["num_frames_per_episode"]) header = ["update", "frames", "FPS", "duration", "difficulty"] data = [update, num_frames, fps, duration, difficulty] header += ["return_" + key for key in rreturn_per_episode.keys()]
def tuner(icm_lr, reward_weighting, normalise_rewards, args):
    """Run one curiosity-driven training job for a hyper-parameter setting.

    Builds 16 environments, an ``ACModel`` and an
    ``AutoencoderWithUncertainty`` (optimised with Adam at learning rate
    *icm_lr*), creates the algorithm selected by ``args.algo`` and trains
    until ``args.frames`` frames have been collected. Every update is sent to
    ``log_to_wandb``; every ``args.log_interval`` updates a summary line is
    written to the text logger; every ``args.save_interval`` updates the
    training status is saved.

    Args:
        icm_lr: Learning rate of the autoencoder optimizer.
        reward_weighting: Forwarded unchanged to the algorithm constructor.
        normalise_rewards: Forwarded unchanged to the algorithm constructor.
        args: Namespace with the remaining settings. ``args.mem`` is set here
            from ``args.recurrence``.

    Returns:
        None.

    Raises:
        ValueError: If ``args.algo`` is neither ``"a2c"`` nor ``"ppo"``.
    """
    import argparse
    import datetime
    import torch
    import torch_ac
    import tensorboardX
    import sys
    import numpy as np
    from model import ACModel
    from .a2c import A2CAlgo
    # NOTE(review): the PPOAlgo import is disabled in this function, so the
    # "ppo" branch below only works if PPOAlgo is provided at module level.
    # from .ppo import PPOAlgo

    # A recurrent model is used whenever gradients flow through > 1 step.
    args.mem = args.recurrence > 1

    def make_exploration_heatmap(args, plot_title):
        # NOTE(review): not called anywhere in this function; the plot_title
        # argument is overwritten before use.
        import numpy as np
        import matplotlib.pyplot as plt

        visitation_counts = np.load(
            f"{args.model}_visitation_counts.npy", allow_pickle=True
        )
        plot_title = str(np.count_nonzero(visitation_counts)) + args.model
        plt.imshow(np.log(visitation_counts))
        plt.colorbar()
        plt.title(plot_title)
        plt.savefig(f"{plot_title}_visitation_counts.png")

    # Set run dir
    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    utils.seed(args.seed)

    # Set device (training is pinned to the CPU here)
    device = "cpu"
    txt_logger.info(f"Device: {device}\n")

    # Load environments
    envs = []
    for i in range(16):
        an_env = utils.make_env(
            args.env, int(args.frames_before_reset), int(args.environment_seed)
        )
        envs.append(an_env)
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo
    # adapted from impact driven RL
    from .models import AutoencoderWithUncertainty

    autoencoder = AutoencoderWithUncertainty(observation_shape=(7, 7, 3)).to(device)
    autoencoder_opt = torch.optim.Adam(
        autoencoder.parameters(), lr=icm_lr, weight_decay=0
    )

    if args.algo == "a2c":
        algo = A2CAlgo(
            envs,
            acmodel,
            autoencoder,
            autoencoder_opt,
            args.uncertainty,
            args.noisy_tv,
            args.curiosity,
            args.randomise_env,
            args.uncertainty_budget,
            args.environment_seed,
            reward_weighting,
            normalise_rewards,
            args.frames_before_reset,
            device,
            args.frames_per_proc,
            args.discount,
            args.lr,
            args.gae_lambda,
            args.entropy_coef,
            args.value_loss_coef,
            args.max_grad_norm,
            args.recurrence,
            args.optim_alpha,
            args.optim_eps,
            preprocess_obss,
            None,
            args.random_action,
        )
    elif args.algo == "ppo":
        algo = PPOAlgo(
            envs,
            acmodel,
            autoencoder,
            autoencoder_opt,
            args.uncertainty,
            args.noisy_tv,
            args.curiosity,
            args.randomise_env,
            args.uncertainty_budget,
            args.environment_seed,
            reward_weighting,
            normalise_rewards,
            device,
            args.frames_per_proc,
            args.discount,
            args.lr,
            args.gae_lambda,
            args.entropy_coef,
            args.value_loss_coef,
            args.max_grad_norm,
            args.recurrence,
            args.optim_eps,
            args.clip_eps,
            args.epochs,
            args.batch_size,
            preprocess_obss,
        )
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    while num_frames < args.frames:
        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        log_to_wandb(logs, start_time, update_start_time, update_end_time)

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += ["num_frames_" + key for key in num_frames_per_episode.keys()]
            data += num_frames_per_episode.values()
            header += [
                "intrinsic_rewards",
                "uncertainties",
                "novel_states_visited",
                "entropy",
                "value",
                "policy_loss",
                "value_loss",
                "grad_norm",
            ]
            data += [
                logs["intrinsic_rewards"].mean().item(),
                logs["uncertainties"].mean().item(),
                logs["novel_states_visited"].mean().item(),
                logs["entropy"],
                logs["value"],
                logs["policy_loss"],
                logs["value_loss"],
                logs["grad_norm"],
            ]

            # One placeholder per entry of `data`, in the same order as
            # `header`, so every label is printed next to its own value.
            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | iR {:.3f} | unc {:.3f} | nov {:.3f} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}".format(
                    *data
                )
            )

        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict(),
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
    return
def main(env_name, seed, meta, load_id, procs, fullObs, POfullObs, frames,
         log_interval, save_interval, experimental, _run):
    """Sacred entry point that trains the (pi_old, pi_train) model pair.

    Sacred fills the arguments from default.yaml or the command line. The
    function copies every entry of *experimental* onto ``exp_config``, builds
    *procs* environments, optionally restores models and status from the
    database (when *load_id* is given), and then trains until *frames* frames
    have been seen. The training model is switched whenever ``scheduling``
    reports a new cycle count.
    """
    # Make the experimental options available everywhere for easy change.
    for option, value in experimental.items():
        setattr(exp_config, option, value)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_name = meta['label'] + "_{}".format(_run._id)
    model_dir = utils.get_model_dir(model_name)

    # Logger and CSV writer
    logger = utils.get_logger(model_dir)
    csv_file, csv_writer = utils.get_csv_writer(model_dir)

    # Log the command line
    logger.info("{}\n".format(" ".join(sys.argv)))

    # Seed all randomness sources
    utils.seed(seed)

    def build_env(rank):
        # One seeded environment, optionally wrapped for full observability.
        single = gym.make(env_name)
        single.seed(seed + 10000 * rank)
        if fullObs:
            return gym_minigrid.wrappers.FullyObsWrapper(single)
        if POfullObs:
            return gym_minigrid.wrappers.PartialObsFullGridWrapper(single)
        return single

    envs = [build_env(rank) for rank in range(procs)]

    # Observation preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        env_name, envs[0].observation_space, model_dir)

    # Training status: fresh start or restore from the database
    if load_id is None:
        # First one is pi_old, second one is pi_train
        acmodels = [None, create_model(obs_space, envs)]
        status = {"num_frames": 0, "update": 0}
        current_cycle_count = 0
        logger.info("Model successfully created\n")
    else:
        model1, model2, status = utils.load_status_and_model_from_db(
            db_uri, db_name, model_dir, load_id)
        if model1 is not None:
            model1 = model1.to(device)
        model2 = model2.to(device)
        acmodels = model1, model2
        current_cycle_count, _ = scheduling(status['num_frames'])
        logger.info("Model successfully loaded\n")
        logger.info("Loaded status: {}".format(status))

    logger.info("{}\n".format(acmodels[0]))
    logger.info("Used device: {}\n".format(device))

    # Actor-critic algorithm
    algo = create_algo(envs, *acmodels, preprocess_obss)

    # Train model
    num_frames = status["num_frames"]
    total_start_time = time.time()
    update = status["update"]

    while num_frames < frames:
        # Switch the training model when a new cycle begins.
        cycle_count, alpha = scheduling(num_frames)
        if cycle_count != current_cycle_count:
            current_cycle_count = cycle_count
            switch_training_model(algo, obs_space, envs)
            logger.info("Switched training model")

        # Update model parameters
        update_start_time = time.time()
        logs = algo.update_parameters(alpha)
        update_end_time = time.time()

        num_frames += logs["num_frames"]
        update += 1

        if update % log_interval == 0:
            # Print logs
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - total_start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            model_fields = [
                "entropy", "value_train", "value_old", "policy_loss_train",
                "policy_loss_old", "value_loss_train", "value_loss_old"
            ]

            header = ["update", "frames", "FPS", "duration"]
            header.extend("rreturn_" + key
                          for key in rreturn_per_episode.keys())
            header.extend("num_frames_" + key
                          for key in num_frames_per_episode.keys())
            header.extend(model_fields)
            header.extend([
                "grad_norm_train", "grad_norm_old", "alpha",
                "reg_loss_policy", "reg_loss_value"
            ])

            data = [update, num_frames, fps, duration]
            data.extend(rreturn_per_episode.values())
            data.extend(num_frames_per_episode.values())
            data.extend(logs[field] for field in model_fields)
            data.extend([
                logs["grad_norm_train"], logs["grad_norm_old"], alpha,
                logs["reg_loss_policy"], logs["reg_loss_value"]
            ])

            # The summary is split over two log lines: the first 15 values,
            # then the rest.
            first_part, second_part = data[:15], data[15:]
            logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V:to {:.3f} {:.3f} "
                .format(*first_part))
            logger.info(
                "pL:to {:.3f} {:.3f} | vL:to {:.3f} {:.3f} | ∇:to {:.3f} {:.3f} | alpha {:.2f} | rLpv {:.3f} {:.3f}\n"
                .format(*second_part))

            header.extend("return_" + key
                          for key in return_per_episode.keys())
            data.extend(return_per_episode.values())

            # status is refreshed below, so the header row is only emitted
            # before the first data row of a fresh run.
            if status["num_frames"] == 0:
                csv_writer.writerow(header)
            csv_writer.writerow(data)
            csv_file.flush()

            for name, scalar in zip(header, data):
                _run.log_scalar(name, scalar, num_frames)

            status = {"num_frames": num_frames, "update": update}

        # Save vocabulary and model
        if save_interval > 0 and update % save_interval == 0:
            preprocess_obss.vocab.save()
            utils.save_model(algo.pi_old, algo.pi_train, model_dir)
            logger.info("Model successfully saved")
            utils.save_status(status, model_dir)
            utils.save_model_to_db(algo.pi_old, algo.pi_train, model_dir,
                                   num_frames, _run)
            db_status = {"num_frames": num_frames, "update": update}
            utils.save_status_to_db(db_status, model_dir, num_frames, _run)
def main():
    """Command-line training entry point (A2C / PPO and intrinsic variants).

    Parses the command line, prepares loggers, environments, model and
    algorithm, restores any saved status, and trains until ``--frames``
    frames have been collected. Every ``--log-interval`` updates a summary is
    written to the text log, the CSV file and TensorBoard; every
    ``--save-interval`` updates the training status is saved. With
    ``--visualize`` the normalized change of selected weight tensors is
    plotted live after every update.

    Raises:
        ValueError: If ``--algo`` names an unknown algorithm.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()

    ## General parameters
    parser.add_argument(
        "--algo",
        required=True,
        help="algorithm to use: a2c | ppo | ppo_intrinsic | a2c_intrinsic (REQUIRED)")
    parser.add_argument("--env",
                        required=True,
                        help="name of the environment to train on (REQUIRED)")
    parser.add_argument(
        "--model",
        default=None,
        help="name of the model (default: {ENV}_{ALGO}_{TIME})")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="random seed (default: 1)")
    parser.add_argument("--log-interval",
                        type=int,
                        default=1,
                        help="number of updates between two logs (default: 1)")
    parser.add_argument(
        "--save-interval",
        type=int,
        default=10,
        help="number of updates between two saves (default: 10, 0 means no saving)")
    parser.add_argument("--procs",
                        type=int,
                        default=16,
                        help="number of processes (default: 16)")
    parser.add_argument("--frames",
                        type=int,
                        default=10**7,
                        help="number of frames of training (default: 1e7)")

    ## Parameters for main algorithm
    parser.add_argument("--epochs",
                        type=int,
                        default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument("--batch-size",
                        type=int,
                        default=256,
                        help="batch size for PPO (default: 256)")
    parser.add_argument(
        "--frames-per-proc",
        type=int,
        default=None,
        help="number of frames per process before update (default: 5 for A2C and 128 for PPO)")
    parser.add_argument("--discount",
                        type=float,
                        default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="learning rate (default: 0.001)")
    parser.add_argument(
        "--gae-lambda",
        type=float,
        default=0.95,
        help="lambda coefficient in GAE formula (default: 0.95, 1 means no gae)")
    parser.add_argument("--entropy-coef",
                        type=float,
                        default=0.01,
                        help="entropy term coefficient (default: 0.01)")
    parser.add_argument("--value-loss-coef",
                        type=float,
                        default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm",
                        type=float,
                        default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument(
        "--optim-eps",
        type=float,
        default=1e-8,
        help="Adam and RMSprop optimizer epsilon (default: 1e-8)")
    parser.add_argument("--optim-alpha",
                        type=float,
                        default=0.99,
                        help="RMSprop optimizer alpha (default: 0.99)")
    parser.add_argument("--clip-eps",
                        type=float,
                        default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument(
        "--recurrence",
        type=int,
        default=1,
        help="number of time-steps gradient is backpropagated (default: 1). If > 1, a LSTM is added to the model to have memory.")
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model to handle text input")
    # Accepts both the bare flag and an explicit value; the value is parsed
    # so that "--visualize False" really disables the plots.
    parser.add_argument("--visualize",
                        nargs="?",
                        const=True,
                        default=False,
                        type=_parse_bool_flag,
                        help="show real time CNN layer weight changes")

    args = parser.parse_args()

    args.mem = args.recurrence > 1

    # Set run dir
    date = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    default_model_name = f"{args.env}_{args.algo}_seed{args.seed}_{date}"

    model_name = args.model or default_model_name
    model_dir = utils.get_model_dir(model_name)

    # Load loggers and Tensorboard writer
    txt_logger = utils.get_txt_logger(model_dir)
    csv_file, csv_logger = utils.get_csv_logger(model_dir)
    tb_writer = tensorboardX.SummaryWriter(model_dir)

    # Log command and all script arguments
    txt_logger.info("{}\n".format(" ".join(sys.argv)))
    txt_logger.info("{}\n".format(args))

    # Set seed for all randomness sources
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    txt_logger.info(f"Device: {device}\n")

    # Load environments
    envs = []
    for i in range(args.procs):
        envs.append(utils.make_env(args.env, args.seed + 10000 * i))
    txt_logger.info("Environments loaded\n")

    # Load training status
    try:
        status = utils.get_status(model_dir)
    except OSError:
        status = {"num_frames": 0, "update": 0}
    txt_logger.info("Training status loaded\n")

    # Load observations preprocessor
    obs_space, preprocess_obss = utils.get_obss_preprocessor(
        envs[0].observation_space)
    if "vocab" in status:
        preprocess_obss.vocab.load_vocab(status["vocab"])
    txt_logger.info("Observations preprocessor loaded")

    # Load model
    acmodel = ACModel(obs_space, envs[0].action_space, args.mem, args.text)
    if "model_state" in status:
        acmodel.load_state_dict(status["model_state"])
    acmodel.to(device)
    txt_logger.info("Model loaded\n")
    txt_logger.info("{}\n".format(acmodel))

    # Load algo
    if args.algo == "a2c":
        algo = torch_ac.A2CAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_alpha, args.optim_eps,
                                preprocess_obss)
    elif args.algo == "ppo":
        algo = torch_ac.PPOAlgo(envs, acmodel, device, args.frames_per_proc,
                                args.discount, args.lr, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence,
                                args.optim_eps, args.clip_eps, args.epochs,
                                args.batch_size, preprocess_obss)
    elif args.algo == "ppo_intrinsic":
        algo = torch_ac.PPOAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_eps, args.clip_eps,
            args.epochs, args.batch_size, preprocess_obss)
    elif args.algo == "a2c_intrinsic":
        algo = torch_ac.A2CAlgoIntrinsic(
            envs, acmodel, device, args.frames_per_proc, args.discount,
            args.lr, args.gae_lambda, args.entropy_coef, args.value_loss_coef,
            args.max_grad_norm, args.recurrence, args.optim_alpha,
            args.optim_eps, preprocess_obss)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    if "optimizer_state" in status:
        algo.optimizer.load_state_dict(status["optimizer_state"])
    txt_logger.info("Optimizer loaded\n")

    # Train model
    num_frames = status["num_frames"]
    update = status["update"]
    start_time = time.time()

    # `status` is only refreshed when saving, so remember explicitly whether
    # the CSV header still has to be written (fresh run, first row only).
    write_csv_header = status["num_frames"] == 0

    print_visual = args.visualize
    if print_visual:
        fig, axs = plt.subplots(1, 3)
        fig.suptitle('Convolution Layer Weights Normalized Difference')

    while num_frames < args.frames:
        # Snapshots are only needed for the live plot, so skip the copies
        # entirely when visualisation is off.
        if print_visual:
            old_parameters = _snapshot_parameters(acmodel)

        # Update model parameters
        update_start_time = time.time()
        exps, logs1 = algo.collect_experiences()
        logs2 = algo.update_parameters(exps)
        logs = {**logs1, **logs2}
        update_end_time = time.time()

        if print_visual:
            new_parameters = _snapshot_parameters(acmodel)
            names = list(old_parameters)
            # Parameters 0, 2 and 4 are plotted on axes 0, 1 and 2
            # (assumed to be the convolution weights -- TODO confirm).
            for index in (0, 2, 4):
                if index >= len(names):
                    break
                key = names[index]
                diff_matrix = abs(new_parameters[key] - old_parameters[key])
                panel = normalize(diff_matrix[:, :, 0, 0],
                                  norm='max',
                                  axis=0)
                axs[index // 2].imshow(panel,
                                       cmap='Greens',
                                       interpolation='nearest')

            # This allows the plots to update as the model trains
            plt.ion()
            plt.show()
            plt.pause(0.001)

        num_frames += logs["num_frames"]
        update += 1

        # Print logs
        if update % args.log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [update, num_frames, fps, duration]
            header += ["rreturn_" + key for key in rreturn_per_episode.keys()]
            data += rreturn_per_episode.values()
            header += [
                "num_frames_" + key for key in num_frames_per_episode.keys()
            ]
            data += num_frames_per_episode.values()
            header += [
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ]
            data += [
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ]

            txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | ∇ {:.3f}"
                .format(*data))

            header += ["return_" + key for key in return_per_episode.keys()]
            data += return_per_episode.values()

            if write_csv_header:
                csv_logger.writerow(header)
                write_csv_header = False
            csv_logger.writerow(data)
            csv_file.flush()

            for field, value in zip(header, data):
                tb_writer.add_scalar(field, value, num_frames)

        # Save status
        if args.save_interval > 0 and update % args.save_interval == 0:
            status = {
                "num_frames": num_frames,
                "update": update,
                "model_state": acmodel.state_dict(),
                "optimizer_state": algo.optimizer.state_dict()
            }
            if hasattr(preprocess_obss, "vocab"):
                status["vocab"] = preprocess_obss.vocab.vocab
            utils.save_status(status, model_dir)
            txt_logger.info("Status saved")


def _parse_bool_flag(text):
    """Return False for common "off" spellings of *text*, True otherwise."""
    return str(text).strip().lower() not in ("", "0", "false", "no", "off")


def _snapshot_parameters(model):
    """Return ``{name: ndarray copy}`` of the model's named parameters."""
    # .cpu() is required because .numpy() only works on CPU tensors and the
    # model may live on a CUDA device.
    return {
        name: param.detach().cpu().numpy().copy()
        for name, param in model.named_parameters()
    }
def learn(self,
          total_timesteps,
          log_interval=1,
          save_interval=10,
          save_env_info=False,
          save_loc=None):
    """
    The primary training loop.

    Repeatedly collects experiences and updates the parameters until
    ``self.num_frames`` reaches ``total_timesteps``. Every ``log_interval``
    updates a summary is written to the text logger, the CSV log and the
    TensorBoard writer; every ``save_interval`` updates the training state
    is saved.

    :param total_timesteps: the total number of timesteps
    :param log_interval: the period between logging/printing updates
        (0 or less disables logging)
    :param save_interval: the number of updates between model saving
        (0 or less disables saving)
    :param save_env_info: if we save the environment info (termination set)
        VERY SLOW
    :param save_loc: currently ignored; a notice is printed when it is set
    :return: True, if training is successful
    """
    # initialize the training algo/environment list/optimizer
    self.init_training_algo()

    if save_loc:
        print('ignoring save_loc override. if this is not intended, fix me')

    # Resume the counters from the stored status.
    self.num_frames = self.status["num_frames"]
    self.update = self.status["update"]
    # The CSV column names are only needed once, and only for a run that
    # starts from scratch. Tracking this locally avoids re-emitting the
    # header on every log line while the stored status still reads 0.
    csv_header_pending = self.status["num_frames"] == 0
    start_time = time.time()

    # loop until we reach the desired number of timesteps
    while self.num_frames < total_timesteps:
        # Timestamps around the update are used for the FPS figure.
        update_start_time = time.time()
        exps, logs1 = self.algo.collect_experiences()
        logs2 = self.algo.update_parameters(exps)
        logs = {**logs1, **logs2}  # merge the logs for printing
        update_end_time = time.time()

        self.num_frames += logs["num_frames"]
        self.update += 1

        # Guarded like save_interval so that 0 means "never" instead of
        # raising ZeroDivisionError in the modulo.
        if log_interval > 0 and self.update % log_interval == 0:
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = int(time.time() - start_time)
            return_per_episode = utils.synthesize(
                logs["return_per_episode"])
            rreturn_per_episode = utils.synthesize(
                logs["reshaped_return_per_episode"])
            num_frames_per_episode = utils.synthesize(
                logs["num_frames_per_episode"])

            header = ["update", "frames", "FPS", "duration"]
            data = [self.update, self.num_frames, fps, duration]

            header.extend("rreturn_" + key for key in rreturn_per_episode)
            data.extend(rreturn_per_episode.values())

            header.extend(
                "num_frames_" + key for key in num_frames_per_episode)
            data.extend(num_frames_per_episode.values())

            header.extend([
                "entropy", "value", "policy_loss", "value_loss", "grad_norm"
            ])
            data.extend([
                logs["entropy"], logs["value"], logs["policy_loss"],
                logs["value_loss"], logs["grad_norm"]
            ])

            # The text line is formatted before the raw-return columns are
            # appended, so those columns only reach the CSV/TensorBoard.
            # NOTE(review): the template has 17 slots, i.e. it assumes
            # utils.synthesize yields four statistics per metric - confirm.
            self.txt_logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | rR:usmM {:.2f} {:.2f} {:.2f} {:.2f} | F:usmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {:.3f} | vL {:.3f} | D {:.3f}"
                .format(*data))

            header.extend("return_" + key for key in return_per_episode)
            data.extend(return_per_episode.values())

            if csv_header_pending:
                self.csv_logger.writerow(header)
                csv_header_pending = False
            self.csv_logger.writerow(data)
            self.csv_file.flush()

            for field, value in zip(header, data):
                self.tb_writer.add_scalar(field, value, self.num_frames)

        # Save status
        if save_interval > 0 and self.update % save_interval == 0:
            self._save_training_info()
            # NOTE(review): environment info is written together with the
            # checkpoint; confirm this nesting against the original layout.
            if save_env_info:
                for e in self.training_envs:
                    if hasattr(e, 'save_env_info'):
                        e.save_env_info()

    self._clear_training_envs()
    return True
while num_frames < args.frames: # Update model parameters update_start_time = time.time() logs = algo.update_parameters() update_end_time = time.time() num_frames += logs["num_frames"] update += 1 # Print logs if update % args.log_interval == 0: fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - total_start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize( logs["reshaped_return_per_episode"]) num_frames_per_episode = utils.synthesize( logs["num_frames_per_episode"]) header = ["update", "frames", "FPS", "duration"] data = [update, num_frames, fps, duration] header += ["rreturn_" + key for key in rreturn_per_episode.keys()] data += rreturn_per_episode.values() header += [ "num_frames_" + key for key in num_frames_per_episode.keys() ] data += num_frames_per_episode.values() header += [ "entropy", "value", "policy_loss", "value_loss", "grad_norm"
args.seed, args.testepisodes * 4, txt_logger, gifName="testing", save=False, dir=args.dir) txt_logger.info( ("testTestReward", testTestReward, "testTestPerformance", testTestPerformance, "testTestPerformanceFull", testTestPerformanceFull)) log_update += 1 fps = logs["num_frames"] / (update_end_time - update_start_time) duration = int(time.time() - start_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) rreturn_per_episode = utils.synthesize( logs["reshaped_return_per_episode"]) performance_per_episode = utils.synthesize( logs["performance_per_episode"]) rperformance_per_episode = utils.synthesize( logs["reshaped_performance_per_episode"]) buttons_per_episode = utils.synthesize(logs["buttons_per_episode"]) reshaped_buttons_per_episode = utils.synthesize( logs["reshaped_buttons_per_episode"]) phones_per_episode = utils.synthesize(logs["phones_per_episode"]) reshaped_phones_per_episode = utils.synthesize( logs["reshaped_phones_per_episode"])
def _parse_bool_flag(text):
    """Convert a command-line token such as ``true``/``0``/``no`` to a bool.

    Raises:
        argparse.ArgumentTypeError: if *text* is not a recognised boolean.
    """
    token = text.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(text))


def main(raw_args=None):
    """Evaluate a trained agent and print aggregate and worst-episode stats.

    Args:
        raw_args: optional list of command-line tokens. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--env",
                        required=True,
                        help="name of the environment (REQUIRED)")
    parser.add_argument("--model",
                        required=True,
                        help="name of the trained model (REQUIRED)")
    parser.add_argument("--episodes",
                        type=int,
                        default=100,
                        help="number of episodes of evaluation (default: 100)")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed (default: 0)")
    parser.add_argument("--procs",
                        type=int,
                        default=1,
                        help="number of processes (default: 1)")
    parser.add_argument("--argmax",
                        action="store_true",
                        default=False,
                        help="action with highest probability is selected")
    parser.add_argument("--worst-episodes-to-show",
                        type=int,
                        default=10,
                        help="how many worst episodes to show")
    parser.add_argument("--memory",
                        action="store_true",
                        default=False,
                        help="add a LSTM to the model")
    parser.add_argument("--text",
                        action="store_true",
                        default=False,
                        help="add a GRU to the model")
    # Without a type every supplied value, including the string "False",
    # was truthy. Accept both the bare flag and an explicit boolean value.
    parser.add_argument("--visualize",
                        nargs="?",
                        const=True,
                        default=False,
                        type=_parse_bool_flag,
                        help="print stuff")
    parser.add_argument("--save_path",
                        default="test_image",
                        help="save path for agent visualizations")
    args = parser.parse_args(raw_args)

    # Set seed for all randomness sources
    utils.seed(args.seed)

    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}\n")

    # Load environments; every process gets its own seed offset
    envs = []
    for i in range(args.procs):
        env = utils.make_env(args.env, args.seed + 10000 * i)
        envs.append(env)
    env = ParallelEnv(envs)
    print("Environments loaded\n")

    # Load agent
    model_dir = utils.get_model_dir(args.model)
    agent = utils.Agent(env.observation_space,
                        env.action_space,
                        model_dir,
                        device=device,
                        argmax=args.argmax,
                        num_envs=args.procs,
                        use_memory=args.memory,
                        use_text=args.text)
    print("Agent loaded\n")

    # Initialize logs
    logs = {"num_frames_per_episode": [], "return_per_episode": []}

    # Run agent
    start_time = time.time()
    obss = env.reset()

    log_done_counter = 0
    log_episode_return = torch.zeros(args.procs, device=device)
    log_episode_num_frames = torch.zeros(args.procs, device=device)

    # Running sum of rendered frames for the episode in progress, and the
    # number of frames added to it.
    img_sum = []
    img_count = 0

    while log_done_counter < args.episodes:
        actions = agent.get_actions(obss)
        obss, rewards, dones, _ = env.step(actions)
        agent.analyze_feedbacks(rewards, dones)

        log_episode_return += torch.tensor(rewards,
                                           device=device,
                                           dtype=torch.float)
        log_episode_num_frames += torch.ones(args.procs, device=device)

        # The rendering is consumed only by the visualisation below, so it
        # is skipped entirely when visualisation is off.
        img = None
        if args.visualize:
            state = env.get_environment_state()
            img = state.grid.render(32,
                                    state.agent_pos,
                                    state.agent_dir,
                                    highlight_mask=None)

        for i, done in enumerate(dones):
            if done:
                log_done_counter += 1
                logs["return_per_episode"].append(log_episode_return[i].item())
                logs["num_frames_per_episode"].append(
                    log_episode_num_frames[i].item())
                if args.visualize:
                    if len(img_sum) > 0:
                        # Average the accumulated frames and save one image
                        # per finished episode.
                        img_sum = img_sum / img_count
                        filepath = args.save_path + '_image_' + str(
                            log_done_counter - 1) + '.jpg'
                        imsave(filepath, img_sum)
                    img_sum = []
                    img_count = 0
            elif args.visualize:
                img_count += 1
                if img_count == 1:
                    img_sum = img
                else:
                    # NOTE(review): the sum is in place in img's dtype; if
                    # render() returns uint8 this can wrap around - confirm.
                    img_sum += img

        # Zero the running counters of the environments that just finished.
        mask = 1 - torch.tensor(dones, device=device, dtype=torch.float)
        log_episode_return *= mask
        log_episode_num_frames *= mask

    end_time = time.time()

    # Print logs
    num_frames = sum(logs["num_frames_per_episode"])
    fps = num_frames / (end_time - start_time)
    duration = int(end_time - start_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    print(
        "F {} | FPS {:.0f} | D {} | R:μσmM {:.2f} {:.2f} {:.2f} {:.2f} | F:μσmM {:.1f} {:.1f} {} {}"
        .format(num_frames, fps, duration, *return_per_episode.values(),
                *num_frames_per_episode.values()))

    # Print worst episodes
    n = args.worst_episodes_to_show
    if n > 0:
        print("\n{} worst episodes:".format(n))

        # Episode indexes ordered by ascending return
        indexes = sorted(range(len(logs["return_per_episode"])),
                         key=lambda k: logs["return_per_episode"][k])
        for i in indexes[:n]:
            print("- episode {}: R={}, F={}".format(
                i, logs["return_per_episode"][i],
                logs["num_frames_per_episode"][i]))
for i, done in enumerate(dones): if done: log_done_counter += 1 logs["return_per_episode"].append(log_episode_return[i].item()) logs["num_frames_per_episode"].append( log_episode_num_frames[i].item()) mask = 1 - torch.tensor(dones, device=device, dtype=torch.float) log_episode_return *= mask log_episode_num_frames *= mask end_time = time.time() # Record values of interest for comparison num_frames_seed = sum(logs["num_frames_per_episode"]) return_per_episode_seed = utils.synthesize(logs["return_per_episode"]) num_frames.append(num_frames_seed) returns_per_episode.append(return_per_episode_seed["mean"]) # Clear envs env = None envs = None # Print things print("returns_per_episode (mean): ", np.mean(returns_per_episode)) print("num_frames (mean): ", np.mean(num_frames)) print(" ") print("returns_per_episode (all seeds): ", returns_per_episode) print("num_frames (all seeds): ", num_frames)