def test_train_save_load() -> None:
    """
    Runs training and compares reward curve against saved baseline for an environment
    with a discrete action space, running a single process, when training is resumed
    from a saved checkpoint.
    """

    # Check that desired results name is available.
    save_name = "test_train_save_load"
    check_results_name(save_name)

    # Load default training config and run training for the first time.
    with open(CARTPOLE_CONFIG_PATH, "r") as config_file:
        config = json.load(config_file)

    # Modify default training config.
    config["save_name"] = save_name

    # Run training to get checkpoint.
    train(config)

    # Modify config for second training run.
    config["load_from"] = save_name
    config["save_name"] = None
    config["baseline_metrics_filename"] = "cartpole_save_load"

    # Run resumed training.
    train(config)

    # Clean up.
    os.system("rm -rf %s" % save_dir_from_name(save_name))

def check_name_uniqueness(
    base_name: str,
    search_type: str,
    iterations: int,
    trials_per_config: int,
    start_pos: Dict[str, int] = None,
    exempt_base: bool = False,
    num_param_values: List[int] = None,
) -> None:
    """
    Check to make sure that there are no other saved experiments whose names coincide
    with the current name. This is just to make sure that the saved results don't get
    mixed up, with some trials being saved with a modified name to ensure uniqueness.
    """

    # Build list of names to check.
    names_to_check = get_experiment_names(
        base_name,
        search_type,
        iterations,
        trials_per_config,
        start_pos,
        num_param_values,
    )

    # Check names.
    for name in names_to_check:
        if exempt_base and name == base_name:
            continue
        if os.path.isdir(save_dir_from_name(name)):
            raise ValueError(
                "Saved result '%s' already exists. Results of hyperparameter searches"
                " must have unique names." % name
            )

def check_results_name(save_name: str) -> None:
    """
    Helper function to check if a results folder already exists, and raise an error
    if so.
    """

    results_dir = save_dir_from_name(save_name)
    if os.path.isdir(results_dir):
        raise ValueError(
            "Saved results with name '%s' already exist. This folder must be renamed"
            " or deleted in order for the test to run properly." % save_name
        )

def test_save_load_multi() -> None:
    """
    Test saving/loading functionality for training when multiprocessing.
    """

    # Check that desired results name is available.
    save_name = "test_save_load_multi"
    check_results_name(save_name)

    # Load default training config.
    with open(CARTPOLE_CONFIG_PATH, "r") as config_file:
        config = json.load(config_file)

    # Modify default training config and run training to save checkpoint.
    config["save_name"] = save_name
    config["num_updates"] = int(config["num_updates"] / MP_FACTOR)
    config["num_processes"] *= MP_FACTOR
    checkpoint = train(config)
    first_metrics = checkpoint["metrics"].state()

    # Run training for the second time, and load from checkpoint.
    config["load_from"] = save_name
    config["save_name"] = None
    config["num_updates"] *= 2
    checkpoint = train(config)
    second_metrics = checkpoint["metrics"].state()

    # Compare metrics.
    assert list(first_metrics.keys()) == list(second_metrics.keys())
    for metric_name in first_metrics.keys():
        first_metric = first_metrics[metric_name]
        second_metric = second_metrics[metric_name]
        assert first_metric["maximum"] <= second_metric["maximum"]
        for key in ["history", "mean", "stdev"]:
            n = len(first_metric[key])
            assert first_metric[key][:n] == second_metric[key][:n]

    # Clean up.
    os.system("rm -rf %s" % save_dir_from_name(save_name))

def resume_template(
    save_name: str,
    config_path: str,
    early_stops: List[Dict[str, int]],
    baseline_name: str,
    results_name: str,
) -> None:
    """
    Runs a hyperparameter search while stopping to save/load at a given set of
    checkpoints, then compares results against the non-interrupted version.
    """

    # Load hyperparameter search config.
    with open(config_path, "r") as config_file:
        config = json.load(config_file)
    config["base_train_config"]["save_name"] = save_name

    # Set baseline to compare against throughout training.
    config["base_train_config"]["baseline_metrics_filename"] = baseline_name

    # Ensure that there are no existing saved experiments whose names coincide with the
    # experiment names used here. We do have to save and load from disk, so we want to
    # make sure that we aren't overwriting any previously existing files.
    num_param_values = None
    if config["search_type"] == "IC_grid":
        num_param_values = get_num_param_values(config["search_params"])
    iterations = get_iterations(
        config["search_type"], config["search_iterations"], config["search_params"]
    )
    check_name_uniqueness(
        save_name,
        config["search_type"],
        iterations,
        config["trials_per_config"],
        num_param_values=num_param_values,
    )

    # Run until hitting each early stopping point.
    for stop_index in range(len(early_stops)):

        # Set early stopping point.
        config["early_stop"] = early_stops[stop_index]

        # Set loading point, if necessary.
        if stop_index > 0:
            config["load_from"] = save_name

        # Run partial training.
        tune(config)

    # Finish training from checkpoint.
    config["early_stop"] = None
    config["load_from"] = save_name
    resumed_results = tune(config)

    # Compare resumed results with un-interrupted results.
    if results_name is not None:
        results_path = os.path.join(METRICS_DIR, "%s.json" % results_name)
        with open(results_path, "r") as results_file:
            correct_results = json.load(results_file)
        assert tune_results_equal(resumed_results, correct_results)

    # Clean up saved results.
    experiment_names = get_experiment_names(
        save_name,
        config["search_type"],
        iterations,
        config["trials_per_config"],
        num_param_values=num_param_values,
    )
    for name in experiment_names:
        save_dir = save_dir_from_name(name)
        if os.path.isdir(save_dir):
            rmtree(save_dir)

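# The sketch below illustrates how resume_template() might be driven. It is a
# hypothetical example rather than an existing test: the config path, baseline name,
# and results name are placeholders, and the early-stop format follows the
# documentation of tune() (e.g. {"iterations": 1, "trials": 0} completes one full
# search iteration before stopping).
def example_resume_usage() -> None:
    """ Hypothetical example: interrupt a hyperparameter search once, then resume. """

    resume_template(
        save_name="example_tune_resume",
        config_path="configs/tune_cartpole_grid.json",  # placeholder path
        early_stops=[{"iterations": 1, "trials": 0}],
        baseline_name="cartpole_tune_baseline",  # placeholder baseline metrics file
        results_name="cartpole_tune_results",  # placeholder expected-results file
    )
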
def train(
    config: Dict[str, Any], policy: PPOPolicy = None
) -> Dict[str, Dict[str, Any]]:
    """
    Main function for train.py, runs PPO training using settings from `config`. The
    expected entries of `config` are documented below. If `policy` is None (the default
    case), then one will be instantiated using settings from `config`. Returns a
    checkpoint dictionary holding the policy, performance metrics from training and
    evaluation, the update iteration, and the config.

    Parameters
    ----------
    env_name : str
        Environment to train on.
    num_updates : int
        Number of update steps.
    rollout_length : int
        Number of environment steps per rollout.
    num_ppo_epochs : int
        Number of PPO epochs per update.
    num_minibatch : int
        Number of mini batches per update step for PPO.
    num_processes : int
        Number of asynchronous environments to run at once.
    lr_schedule_type : str
        Either None, "exponential", "cosine", or "linear". If None is given, the
        learning rate will stay at initial_lr for the duration of training.
    initial_lr : float
        Initial policy learning rate.
    final_lr : float
        Final policy learning rate.
    eps : float
        Epsilon value for numerical stability.
    value_loss_coeff : float
        PPO value loss coefficient.
    entropy_loss_coeff : float
        PPO entropy loss coefficient.
    gamma : float
        Discount factor for rewards.
    gae_lambda : float
        Lambda parameter for GAE (used in equation (11) of PPO paper).
    max_grad_norm : float
        Max norm of gradients.
    clip_param : float
        Clipping parameter for PPO surrogate loss.
    clip_value_loss : bool
        Whether or not to clip the value loss.
    normalize_advantages : bool
        Whether or not to normalize advantages after computation.
    normalize_transition : bool
        Whether or not to normalize observations and rewards.
    architecture_config : Dict[str, Any]
        Config dictionary for the architecture. Should contain an entry for "type",
        which is either "vanilla", "trunk", "splitting_v1" or "splitting_v2", and all
        other entries should correspond to the keyword arguments for the corresponding
        network class, such as VanillaNetwork, MultiTaskTrunkNetwork, or
        MultiTaskSplittingNetworkV1. This can also be None in the case that `policy`
        is not None.
    evaluation_freq : int
        Number of training iterations between evaluation runs.
    evaluation_episodes : int
        Number of episodes to roll out per evaluation run.
    cuda : bool
        Whether or not to train on GPU.
    seed : int
        Random seed.
    print_freq : int
        Number of training iterations between metric printing.
    save_freq : int
        Number of training iterations between saving of intermediate progress. If
        None, no saving of intermediate progress will occur. Note that if save_name is
        None, then this value will just be ignored.
    load_from : str
        Name of a saved results directory (as saved by this function) from which to
        load a checkpoint in order to resume training.
    metrics_filename : str
        Name to save metric values under.
    baseline_metrics_filename : str
        Name of metrics baseline file to compare against.
    save_name : str
        Name to save experiments under.
    same_np_seed : bool
        Whether or not to use the same numpy random seed across each process. This
        should really only be used when training on MetaWorld, as it allows for
        multiple processes to generate/act over the same set of goals.
    save_memory : bool
        (Optional) Whether or not to save memory when training on a multi-task
        MetaWorld benchmark by creating a new environment instance at each episode.
        Only applicable to MetaWorld training. Defaults to False if not included.
    """

    # Construct save directory.
    if config["save_name"] is not None:

        # Append "_n" (for the minimal n) to name to ensure that save name is unique,
        # and create the save directory.
        original_save_name = config["save_name"]
        save_dir = save_dir_from_name(config["save_name"])
        n = 0
        while os.path.isdir(save_dir):
            n += 1
            if n > 1:
                index_start = config["save_name"].rindex("_")
                config["save_name"] = config["save_name"][:index_start] + "_%d" % n
            else:
                config["save_name"] += "_1"
            save_dir = save_dir_from_name(config["save_name"])
        os.makedirs(save_dir)
        if original_save_name != config["save_name"]:
            print(
                "There already exists saved results with name '%s'. Saving current "
                "results under name '%s'." % (original_save_name, config["save_name"])
            )

        # Save config.
        config_path = os.path.join(save_dir, "%s_config.json" % config["save_name"])
        with open(config_path, "w") as config_file:
            json.dump(config, config_file, indent=4)

        # Set logger path.
        log_path = os.path.join(save_dir, "%s_log.txt" % config["save_name"])
        logger.log_path = log_path
        os.mknod(log_path)

        # Try to save repo git hash. This will only work when running training from
        # inside the repository.
        try:
            version_path = os.path.join(save_dir, "VERSION")
            os.system("git rev-parse HEAD > %s" % version_path)
        except:
            pass

    # Set random seed, number of threads, and device.
    np.random.seed(config["seed"])
    torch.manual_seed(config["seed"])
    torch.cuda.manual_seed_all(config["seed"])
    torch.set_num_threads(1)
    if config["cuda"]:
        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")
            print(
                'Warning: config["cuda"] = True but torch.cuda.is_available() = '
                "False. Using CPU for training."
            )
    else:
        device = torch.device("cpu")

    # Set environment and policy.
    num_tasks = get_num_tasks(config["env_name"])
    kwargs = {}
    if "save_memory" in config:
        kwargs["save_memory"] = config["save_memory"]
    env = get_env(
        config["env_name"],
        config["num_processes"],
        config["seed"],
        config["time_limit"],
        config["normalize_transition"],
        config["normalize_first_n"],
        allow_early_resets=True,
        same_np_seed=config["same_np_seed"],
        **kwargs,
    )
    if policy is None:
        policy = PPOPolicy(
            observation_space=env.observation_space,
            action_space=env.action_space,
            num_minibatch=config["num_minibatch"],
            num_processes=config["num_processes"],
            rollout_length=config["rollout_length"],
            num_updates=config["num_updates"],
            architecture_config=config["architecture_config"],
            num_tasks=num_tasks,
            num_ppo_epochs=config["num_ppo_epochs"],
            lr_schedule_type=config["lr_schedule_type"],
            initial_lr=config["initial_lr"],
            final_lr=config["final_lr"],
            eps=config["eps"],
            value_loss_coeff=config["value_loss_coeff"],
            entropy_loss_coeff=config["entropy_loss_coeff"],
            gamma=config["gamma"],
            gae_lambda=config["gae_lambda"],
            clip_param=config["clip_param"],
            max_grad_norm=config["max_grad_norm"],
            clip_value_loss=config["clip_value_loss"],
            normalize_advantages=config["normalize_advantages"],
            device=device,
        )

    # Construct object to store rollout information.
    rollout = RolloutStorage(
        rollout_length=config["rollout_length"],
        observation_space=env.observation_space,
        action_space=env.action_space,
        num_processes=config["num_processes"],
        hidden_state_size=policy.policy_network.recurrent_hidden_size
        if policy.recurrent
        else 1,
        device=device,
    )

    # Initialize environment and set first observation.
    rollout.set_initial_obs(env.reset())

    # Construct metrics object to hold performance metrics.
    TRAIN_WINDOW = 500
    test_window = round(TRAIN_WINDOW / config["evaluation_episodes"])
    metrics = Metrics(train_window=TRAIN_WINDOW, test_window=test_window)

    # Load intermediate progress from checkpoint, if necessary.
    update_iteration = 0
    if config["load_from"] is not None:
        checkpoint_filename = os.path.join(
            save_dir_from_name(config["load_from"]), "checkpoint.pkl"
        )
        with open(checkpoint_filename, "rb") as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)

        # Make sure current config and previous config line up.
        assert aligned_train_configs(config, checkpoint["config"])

        # Load policy, metrics, and update iteration.
        policy = checkpoint["policy"]
        metrics = checkpoint["metrics"]
        update_iteration = checkpoint["update_iteration"]

    # Training loop.
    policy.train = True
    while update_iteration < config["num_updates"]:

        # Sample rollout.
        rollout, episode_rewards, episode_successes = collect_rollout(
            rollout, env, policy
        )

        # Compute update.
        for step_loss in policy.get_loss(rollout):

            # If we're training a splitting network, pass it the task-specific losses.
            if policy.policy_network.architecture_type in [
                "splitting_v1",
                "splitting_v2",
            ]:
                policy.policy_network.actor.check_for_split(step_loss)
                policy.policy_network.critic.check_for_split(step_loss)

            # If we're training a trunk network, check for frequency of conflicting
            # gradients.
            if policy.policy_network.architecture_type == "trunk":
                if policy.policy_network.actor.monitor_grads:
                    policy.policy_network.actor.check_conflicting_grads(step_loss)
                if policy.policy_network.critic.monitor_grads:
                    policy.policy_network.critic.check_conflicting_grads(step_loss)

            # If we are multi-task training, consolidate task-losses with weighted sum.
            if num_tasks > 1:
                step_loss = torch.sum(step_loss)

            # Perform backward pass, clip gradient, and take optimizer step.
            policy.policy_network.zero_grad()
            step_loss.backward()
            if config["max_grad_norm"] is not None:
                nn.utils.clip_grad_norm_(
                    policy.policy_network.parameters(), config["max_grad_norm"]
                )
            policy.optimizer.step()
            policy.after_step()

        # Reset rollout storage.
        rollout.reset()

        # Aggregate metrics and run evaluation, if necessary.
        step_metrics = {}
        step_metrics["train_reward"] = episode_rewards
        step_metrics["train_success"] = episode_successes
        if (
            update_iteration % config["evaluation_freq"] == 0
            or update_iteration == config["num_updates"] - 1
        ):

            # Reset environment and rollout, so we don't cross-contaminate episodes
            # from training and evaluation.
            rollout.init_rollout_info()
            rollout.set_initial_obs(env.reset())

            # Run evaluation and record metrics.
            policy.train = False
            evaluation_rewards, evaluation_successes = evaluate(
                env, policy, rollout, config["evaluation_episodes"],
            )
            policy.train = True
            step_metrics["eval_reward"] = evaluation_rewards
            step_metrics["eval_success"] = evaluation_successes

            # Reset environment and rollout, as above.
            rollout.init_rollout_info()
            rollout.set_initial_obs(env.reset())

        # Update and print metrics.
        metrics.update(step_metrics)
        if (
            update_iteration % config["print_freq"] == 0
            or update_iteration == config["num_updates"] - 1
        ):
            message = "Update %d | " % update_iteration
            message += str(metrics)
            message += "\t"
            print(message, end="\r")

        # This is to ensure that printed out values don't get overwritten after we
        # finish.
        if update_iteration == config["num_updates"] - 1:
            print("")

        # Save intermediate training progress, if necessary. Note that we save an
        # incremented version of update_iteration so that the loaded version will take
        # on the subsequent value of update_iteration on the first step.
        if config["save_name"] is not None and (
            update_iteration == config["num_updates"] - 1
            or (
                config["save_freq"] is not None
                and update_iteration % config["save_freq"] == 0
            )
        ):
            checkpoint = {}
            checkpoint["policy"] = policy
            checkpoint["metrics"] = metrics
            checkpoint["update_iteration"] = update_iteration + 1
            checkpoint["config"] = config

            checkpoint_filename = os.path.join(save_dir, "checkpoint.pkl")
            with open(checkpoint_filename, "wb") as checkpoint_file:
                pickle.dump(checkpoint, checkpoint_file)

        update_iteration += 1

    # Close environment.
    env.close()

    # Save metrics if necessary.
    if config["metrics_filename"] is not None:
        if not os.path.isdir(METRICS_DIR):
            os.makedirs(METRICS_DIR)
        metrics_path = os.path.join(METRICS_DIR, "%s.pkl" % config["metrics_filename"])
        with open(metrics_path, "wb") as metrics_file:
            pickle.dump(metrics.history(), metrics_file)

    # Compare metrics to baseline if necessary.
    if config["baseline_metrics_filename"] is not None:
        baseline_metrics_path = os.path.join(
            METRICS_DIR, "%s.pkl" % config["baseline_metrics_filename"]
        )
        compare_metrics(metrics.history(), baseline_metrics_path)

    # Save results if necessary.
    if config["save_name"] is not None:

        # Save metrics.
        metrics_path = os.path.join(save_dir, "%s_metrics.json" % config["save_name"])
        with open(metrics_path, "w") as metrics_file:
            json.dump(metrics.state(), metrics_file, indent=4)

        # Plot results.
        plot_path = os.path.join(save_dir, "%s_plot.png" % config["save_name"])
        plot(metrics.state(), plot_path)

    # Construct checkpoint.
    checkpoint = {}
    checkpoint["policy"] = policy
    checkpoint["metrics"] = metrics
    checkpoint["update_iteration"] = update_iteration + 1
    checkpoint["config"] = config

    return checkpoint

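# The sketch below shows one way train() can be invoked directly from a JSON config,
# mirroring the tests above. It is an illustrative example, not part of the training
# code: the config path "configs/cartpole.json" and the overridden values are
# placeholders; any config containing the entries documented in train()'s docstring
# should work.
def example_train_usage() -> None:
    """ Hypothetical example: run a short, checkpointed training run from a config. """

    with open("configs/cartpole.json", "r") as config_file:  # placeholder path
        config = json.load(config_file)

    # Override a few settings for a short run whose progress is checkpointed.
    config["num_updates"] = 100
    config["save_name"] = "example_run"
    config["save_freq"] = 50

    # train() returns a checkpoint dictionary holding the policy, metrics, config, and
    # next update iteration.
    checkpoint = train(config)
    print(checkpoint["metrics"])
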
def tune(tune_config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform search over hyperparameter configurations. Only argument is
    ``tune_config``, a dictionary holding settings for the hyperparameter search. The
    expected elements of this dictionary are documented below. This function returns a
    dictionary holding the results of training and the various parameter
    configurations used.

    Parameters
    ----------
    search_type : str
        Either "random", "grid", or "IC_grid", defines the search strategy to use.
    search_iterations : int
        Number of different hyperparameter configurations to try in search sequence.
        In cases where the number of configurations is determined by ``search_params``
        (such as when using grid search), the value of this variable is ignored, and
        the determined value is used instead.
    early_stop : Dict[str, int]
        Options to stop before reaching the end of training. This is mainly for
        simulating interruptions in tests. Should have two keys, "iterations" and
        "trials"; the corresponding value of each denotes how many of each to execute
        before stopping early. For example, {"iterations": 3, "trials": 1} will
        execute 3 whole iterations, and 1 trial of the 4th iteration. If early
        stopping isn't desired, this value can just be set to None.
    trials_per_config : int
        Number of training runs to perform for each hyperparameter configuration. The
        fitness of each training run is averaged to produce an overall fitness for
        each hyperparameter configuration.
    base_train_config : Dict[str, Any]
        Config dictionary for function train() in meta/train.py. This is used as a
        starting point for hyperparameter search. It is required that each leaf
        element of this config dictionary have a unique key, i.e. a config containing
        base_train_config["key1"]["num_layers"] and
        base_train_config["key2"]["num_layers"] is invalid. This occurrence will cause
        unexpected behavior due to the implementation of update_config().
    search_params : Dict[str, Any]
        Search specifications for each parameter, such as max/min values, etc. The
        format of this dictionary varies between different search types.
    fitness_metric_name : str
        Name of metric (key in metrics dictionary returned from train()) to use as
        fitness function for hyperparameter search. Currently supported values are
        "train_reward", "eval_reward", "train_success", "eval_success".
    fitness_metric_type : str
        Either "mean" or "maximum", used to determine which value of the metric given
        in tune_config["fitness_metric_name"] to use as fitness, either the mean value
        at the end of training or the maximum value throughout training.
    seed : int
        Random seed for hyperparameter search.
    load_from : str
        Name of results directory to resume training from.
    """

    # Extract info from config.
    search_type = tune_config["search_type"]
    iterations = tune_config["search_iterations"]
    early_stop = tune_config["early_stop"]
    trials_per_config = tune_config["trials_per_config"]
    base_config = tune_config["base_train_config"]
    search_params = tune_config["search_params"]
    fitness_metric_name = tune_config["fitness_metric_name"]
    fitness_metric_type = tune_config["fitness_metric_type"]
    seed = tune_config["seed"]
    load_from = tune_config["load_from"]

    # Compute iterations from tune_config["search_params"] if necessary. When search
    # type is "grid" or "IC_grid", iterations must be computed from ``search_params``.
    if search_type in ["grid", "IC_grid"]:
        iterations = get_iterations(search_type, iterations, search_params)

    # Load checkpoint, if necessary.
    if load_from is not None:
        load_dir = save_dir_from_name(load_from)
        checkpoint_filename = os.path.join(load_dir, "checkpoint.pkl")
        with open(checkpoint_filename, "rb") as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)

        # Make sure current config and previous config line up.
        assert aligned_tune_configs(tune_config, checkpoint["tune_config"])
    else:
        load_dir = None
        checkpoint = None

    # Read in base name and make sure it is valid. Naming is slightly different for
    # different search strategies, so we do some weirdness here to make one function
    # which handles all cases. If it is valid, we make the save directory and save the
    # initial config.
    base_name = base_config["save_name"]
    if base_name is not None:

        # Compute starting position from previous checkpoint.
        start_pos = get_start_pos(search_type, checkpoint)

        # Edge case: If ``load_from == base_name``, then we exempt ``base_name`` from
        # the uniqueness check.
        exempt_base = load_from is not None and load_from == base_name

        # Check uniqueness of each training name.
        check_args = [
            base_name,
            search_type,
            iterations,
            trials_per_config,
            start_pos,
            exempt_base,
        ]
        if search_type == "IC_grid":
            num_param_values = get_num_param_values(search_params)
            check_args.append(num_param_values)
        check_name_uniqueness(*check_args)

        # Create save directory, if we aren't loading from an already existing
        # directory of the same name.
        save_dir = save_dir_from_name(base_name)
        if not exempt_base:
            os.makedirs(save_dir)

        # Save config.
        config_path = os.path.join(save_dir, "%s_config.json" % base_name)
        with open(config_path, "w") as config_file:
            json.dump(tune_config, config_file, indent=4)

    else:
        save_dir = None

    # Construct fitness function.
    if fitness_metric_name not in [
        "train_reward",
        "eval_reward",
        "train_success",
        "eval_success",
    ]:
        raise ValueError("Unsupported metric name: '%s'." % fitness_metric_name)
    if fitness_metric_type == "mean":
        fitness_fn = lambda metrics: metrics[fitness_metric_name]["mean"][-1]
    elif fitness_metric_type == "maximum":
        fitness_fn = lambda metrics: metrics[fitness_metric_name]["maximum"]
    else:
        raise ValueError("Unsupported metric type: '%s'." % fitness_metric_type)

    # Set random seed. Note that this may cause reproducibility issues since the
    # train() function now uses the random module.
    random.seed(seed)

    # Run the chosen search strategy.
    if tune_config["search_type"] == "random":
        search_fn = random_search
    elif tune_config["search_type"] == "grid":
        search_fn = grid_search
    elif tune_config["search_type"] == "IC_grid":
        search_fn = IC_grid_search
    else:
        raise ValueError("Unsupported search type: '%s'." % tune_config["search_type"])
    results = search_fn(
        tune_config,
        base_config,
        iterations,
        early_stop,
        trials_per_config,
        fitness_fn,
        search_params,
        save_dir,
        checkpoint,
    )

    # Save results, if necessary.
    if base_name is not None:
        results_path = os.path.join(save_dir, "%s_results.json" % base_name)
        with open(results_path, "w") as results_file:
            json.dump(results, results_file, indent=4)

    return results

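# The sketch below shows a minimal tune() invocation, assuming a JSON file holding the
# entries documented in tune()'s docstring. The path "configs/tune_cartpole.json" is a
# placeholder, and the function itself is a hypothetical example rather than part of
# the search code.
def example_tune_usage() -> Dict[str, Any]:
    """ Hypothetical example: run a hyperparameter search from a JSON config. """

    with open("configs/tune_cartpole.json", "r") as config_file:  # placeholder path
        tune_config = json.load(config_file)

    # To resume an interrupted search, set "load_from" to the saved results name,
    # e.g. tune_config["load_from"] = tune_config["base_train_config"]["save_name"],
    # as the resume tests above do.
    return tune(tune_config)
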