Example 1
def test_train_save_load() -> None:
    """
    Runs training and compares reward curve against saved baseline for an environment
    with a discrete action space, running a single process, when training is resumed
    from a saved checkpoint.
    """

    # Check that desired results name is available.
    save_name = "test_train_save_load"
    check_results_name(save_name)

    # Load default training config.
    with open(CARTPOLE_CONFIG_PATH, "r") as config_file:
        config = json.load(config_file)

    # Modify default training config.
    config["save_name"] = save_name

    # Run training to get checkpoint.
    train(config)

    # Modify config for second training run.
    config["load_from"] = save_name
    config["save_name"] = None
    config["baseline_metrics_filename"] = "cartpole_save_load"

    # Run resumed training.
    train(config)

    # Clean up.
    os.system("rm -rf %s" % save_dir_from_name(save_name))
Example 2
def check_name_uniqueness(
    base_name: str,
    search_type: str,
    iterations: int,
    trials_per_config: int,
    start_pos: Optional[Dict[str, int]] = None,
    exempt_base: bool = False,
    num_param_values: Optional[List[int]] = None,
) -> None:
    """
    Check to make sure that there are no other saved experiments whose names coincide
    with the current name. This ensures that saved results don't get mixed up, which
    could otherwise happen if some trials were saved under a modified name to keep
    names unique.
    """

    # Build list of names to check.
    names_to_check = get_experiment_names(
        base_name,
        search_type,
        iterations,
        trials_per_config,
        start_pos,
        num_param_values,
    )

    # Check names.
    for name in names_to_check:
        if exempt_base and name == base_name:
            continue

        if os.path.isdir(save_dir_from_name(name)):
            raise ValueError(
                "Saved result '%s' already exists. Results of hyperparameter searches"
                " must have unique names." % name)
Example 3
def check_results_name(save_name: str) -> None:
    """
    Helper function to check if a results folder already exists, and raise an error if
    so.
    """

    results_dir = save_dir_from_name(save_name)
    if os.path.isdir(results_dir):
        raise ValueError(
            "Saved results with name '%s' already exist. This folder must be renamed "
            "or deleted in order for the test to run properly." % save_name)
Example 4
def test_save_load_multi() -> None:
    """
    Test saving/loading functionality for training with multiple processes.
    """

    # Check that desired results name is available.
    save_name = "test_save_load_multi"
    check_results_name(save_name)

    # Load default training config.
    with open(CARTPOLE_CONFIG_PATH, "r") as config_file:
        config = json.load(config_file)

    # Modify default training config and run training to save checkpoint.
    config["save_name"] = save_name
    config["num_updates"] = int(config["num_updates"] / MP_FACTOR)
    config["num_processes"] *= MP_FACTOR
    checkpoint = train(config)
    first_metrics = checkpoint["metrics"].state()

    # Run training for the second time, and load from checkpoint.
    config["load_from"] = save_name
    config["save_name"] = None
    config["num_updates"] *= 2
    checkpoint = train(config)
    second_metrics = checkpoint["metrics"].state()

    # Compare metrics.
    assert list(first_metrics.keys()) == list(second_metrics.keys())
    for metric_name in first_metrics.keys():
        first_metric = first_metrics[metric_name]
        second_metric = second_metrics[metric_name]

        assert first_metric["maximum"] <= second_metric["maximum"]
        for key in ["history", "mean", "stdev"]:
            n = len(first_metric[key])
            assert first_metric[key][:n] == second_metric[key][:n]

    # Clean up.
    os.system("rm -rf %s" % save_dir_from_name(save_name))
Example 5
def resume_template(
    save_name: str,
    config_path: str,
    early_stops: List[Dict[str, int]],
    baseline_name: str,
    results_name: str,
) -> None:
    """
    Runs a hyperparameter search while stopping to save/load at a given set of
    checkpoints, then compares results against the non-interrupted version.
    """

    # Load hyperparameter search config.
    with open(config_path, "r") as config_file:
        config = json.load(config_file)
    config["base_train_config"]["save_name"] = save_name

    # Set baseline to compare against throughout training.
    config["base_train_config"]["baseline_metrics_filename"] = baseline_name

    # Ensure that there are no existing saved experiments whose names coincide with the
    # experiment names used here. We do have to save and load from disk so we want to
    # make sure that we aren't overwriting any previously existing files.
    num_param_values = None
    if config["search_type"] == "IC_grid":
        num_param_values = get_num_param_values(config["search_params"])
    iterations = get_iterations(config["search_type"],
                                config["search_iterations"],
                                config["search_params"])
    check_name_uniqueness(
        save_name,
        config["search_type"],
        iterations,
        config["trials_per_config"],
        num_param_values=num_param_values,
    )

    # Run until hitting each early stopping point.
    for stop_index in range(len(early_stops)):

        # Set early stopping point.
        config["early_stop"] = early_stops[stop_index]

        # Set loading point, if necessary.
        if stop_index > 0:
            config["load_from"] = save_name

        # Run partial training.
        tune(config)

    # Finish training from checkpoint.
    config["early_stop"] = None
    config["load_from"] = save_name
    resumed_results = tune(config)

    # Compare resumed results with un-interrupted results.
    if results_name is not None:
        results_path = os.path.join(METRICS_DIR, "%s.json" % results_name)
        with open(results_path, "r") as results_file:
            correct_results = json.load(results_file)
        assert tune_results_equal(resumed_results, correct_results)

    # Clean up saved results.
    experiment_names = get_experiment_names(
        save_name,
        config["search_type"],
        iterations,
        config["trials_per_config"],
        num_param_values=num_param_values,
    )
    for name in experiment_names:
        save_dir = save_dir_from_name(name)
        if os.path.isdir(save_dir):
            rmtree(save_dir)
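An illustrative invocation of resume_template, interrupting a search twice before letting it run to completion. The config path and names here are hypothetical; the early-stop format follows the tune() docstring in the example below.

# Illustrative call; the config path and names are placeholders.
resume_template(
    save_name="test_tune_resume",
    config_path="configs/tune_cartpole.json",
    early_stops=[
        {"iterations": 1, "trials": 0},  # stop after 1 full iteration
        {"iterations": 2, "trials": 1},  # stop after 2 iterations plus 1 trial of the 3rd
    ],
    baseline_name="cartpole_tune",
    results_name="tune_resume",
)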
Example 6
def train(config: Dict[str, Any],
          policy: Optional[PPOPolicy] = None) -> Dict[str, Any]:
    """
    Main function for train.py; runs PPO training using settings from `config`. The
    expected entries of `config` are documented below. If `policy` is None (the default
    case), then one will be instantiated using settings from `config`. Returns a
    checkpoint dictionary holding the trained policy, performance metrics, the config,
    and the next update iteration.

    Parameters
    ----------
    env_name : str
        Environment to train on.
    num_updates : int
        Number of update steps.
    rollout_length : int
        Number of environment steps per rollout.
    num_ppo_epochs : int
        Number of ppo epochs per update.
    num_minibatch : int
        Number of mini batches per update step for PPO.
    num_processes : int
        Number of asynchronous environments to run at once.
    lr_schedule_type : str
        Either None, "exponential", "cosine", or "linear". If None is given, the
        learning rate will stay at initial_lr for the duration of training.
    initial_lr : float
        Initial policy learning rate.
    final_lr : float
        Final policy learning rate.
    eps : float
        Epsilon value for numerical stability.
    value_loss_coeff : float
        PPO value loss coefficient.
    entropy_loss_coeff : float
        PPO entropy loss coefficient.
    gamma : float
        Discount factor for rewards.
    gae_lambda : float
        Lambda parameter for GAE (used in equation (11) of PPO paper).
    max_grad_norm : float
        Max norm of gradients.
    clip_param : float
        Clipping parameter for PPO surrogate loss.
    clip_value_loss : bool
        Whether or not to clip the value loss.
    normalize_advantages : bool
        Whether or not to normalize advantages after computation.
    normalize_transition : bool
        Whether or not to normalize observations and rewards.
    architecture_config : Dict[str, Any]
        Config dictionary for the architecture. Should contain an entry for "type",
        which is either "vanilla", "trunk", "splitting_v1" or "splitting_v2", and all
        other entries should correspond to the keyword arguments for the corresponding
        network class, which is either VanillaNetwork, MultiTaskTrunkNetwork, or
        MultiTaskSplittingNetworkV1. This can also be None in the case that `policy` is
        not None.
    cuda : bool
        Whether or not to train on GPU.
    seed : int
        Random seed.
    print_freq : int
        Number of training iterations between metric printing.
    save_freq : int
        Number of training iterations between saving of intermediate progress. If None,
        no saving of intermediate progress will occur. Note that if save_name is None,
        then this value will just be ignored.
    load_from : str
        Path of checkpoint file (as saved by this function) to load from in order to
        resume training.
    metrics_filename : str
        Name to save metric values under.
    baseline_metrics_filename : str
        Name of metrics baseline file to compare against.
    save_name : str
        Name to save experiments under.
    same_np_seed : bool
        Whether or not to use the same numpy random seed across each process. This
        should really only be used when training on MetaWorld, as it allows for multiple
        processes to generate/act over the same set of goals.
    save_memory : bool
        (Optional) Whether or not to save memory when training on a multi-task MetaWorld
        benchmark by creating a new environment instance at each episode. Only
        applicable to MetaWorld training. Defaults to False if not included.
    """

    # Construct save directory.
    if config["save_name"] is not None:

        # Append "_n" (for the minimal n) to name to ensure that save name is unique,
        # and create the save directory.
        original_save_name = config["save_name"]
        save_dir = save_dir_from_name(config["save_name"])
        n = 0
        while os.path.isdir(save_dir):
            n += 1
            if n > 1:
                index_start = config["save_name"].rindex("_")
                config["save_name"] = config[
                    "save_name"][:index_start] + "_%d" % n
            else:
                config["save_name"] += "_1"
            save_dir = save_dir_from_name(config["save_name"])
        os.makedirs(save_dir)
        if original_save_name != config["save_name"]:
            print(
                "There already exists saved results with name '%s'. Saving current "
                "results under name '%s'." %
                (original_save_name, config["save_name"]))

        # Save config.
        config_path = os.path.join(save_dir,
                                   "%s_config.json" % config["save_name"])
        with open(config_path, "w") as config_file:
            json.dump(config, config_file, indent=4)

        # Set logger path.
        log_path = os.path.join(save_dir, "%s_log.txt" % config["save_name"])
        logger.log_path = log_path
        os.mknod(log_path)

        # Try to save repo git hash. This will only work when running training from
        # inside the repository.
        try:
            version_path = os.path.join(save_dir, "VERSION")
            os.system("git rev-parse HEAD > %s" % version_path)
        except Exception:
            pass

    # Set random seed, number of threads, and device.
    np.random.seed(config["seed"])
    torch.manual_seed(config["seed"])
    torch.cuda.manual_seed_all(config["seed"])
    torch.set_num_threads(1)
    if config["cuda"]:
        if torch.cuda.is_available():
            device = torch.device("cuda:0")
        else:
            device = torch.device("cpu")
            print(
                'Warning: config["cuda"] = True but torch.cuda.is_available() = '
                "False. Using CPU for training.")
    else:
        device = torch.device("cpu")

    # Set environment and policy.
    num_tasks = get_num_tasks(config["env_name"])
    kwargs = {}
    if "save_memory" in config:
        kwargs["save_memory"] = config["save_memory"]
    env = get_env(
        config["env_name"],
        config["num_processes"],
        config["seed"],
        config["time_limit"],
        config["normalize_transition"],
        config["normalize_first_n"],
        allow_early_resets=True,
        same_np_seed=config["same_np_seed"],
        **kwargs,
    )
    if policy is None:
        policy = PPOPolicy(
            observation_space=env.observation_space,
            action_space=env.action_space,
            num_minibatch=config["num_minibatch"],
            num_processes=config["num_processes"],
            rollout_length=config["rollout_length"],
            num_updates=config["num_updates"],
            architecture_config=config["architecture_config"],
            num_tasks=num_tasks,
            num_ppo_epochs=config["num_ppo_epochs"],
            lr_schedule_type=config["lr_schedule_type"],
            initial_lr=config["initial_lr"],
            final_lr=config["final_lr"],
            eps=config["eps"],
            value_loss_coeff=config["value_loss_coeff"],
            entropy_loss_coeff=config["entropy_loss_coeff"],
            gamma=config["gamma"],
            gae_lambda=config["gae_lambda"],
            clip_param=config["clip_param"],
            max_grad_norm=config["max_grad_norm"],
            clip_value_loss=config["clip_value_loss"],
            normalize_advantages=config["normalize_advantages"],
            device=device,
        )

    # Construct object to store rollout information.
    rollout = RolloutStorage(
        rollout_length=config["rollout_length"],
        observation_space=env.observation_space,
        action_space=env.action_space,
        num_processes=config["num_processes"],
        hidden_state_size=policy.policy_network.recurrent_hidden_size
        if policy.recurrent else 1,
        device=device,
    )

    # Initialize environment and set first observation.
    rollout.set_initial_obs(env.reset())

    # Construct metrics object to hold performance metrics.
    TRAIN_WINDOW = 500
    test_window = round(TRAIN_WINDOW / config["evaluation_episodes"])
    metrics = Metrics(train_window=TRAIN_WINDOW, test_window=test_window)

    # Load intermediate progress from checkpoint, if necessary.
    update_iteration = 0
    if config["load_from"] is not None:
        checkpoint_filename = os.path.join(
            save_dir_from_name(config["load_from"]), "checkpoint.pkl")
        with open(checkpoint_filename, "rb") as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)

        # Make sure current config and previous config line up.
        assert aligned_train_configs(config, checkpoint["config"])

        # Load policy, metrics, and update iteration.
        policy = checkpoint["policy"]
        metrics = checkpoint["metrics"]
        update_iteration = checkpoint["update_iteration"]

    # Training loop.
    policy.train = True

    while update_iteration < config["num_updates"]:

        # Sample rollout.
        rollout, episode_rewards, episode_successes = collect_rollout(
            rollout, env, policy)

        # Compute update.
        for step_loss in policy.get_loss(rollout):

            # If we're training a splitting network, pass it the task-specific losses.
            if policy.policy_network.architecture_type in [
                    "splitting_v1",
                    "splitting_v2",
            ]:
                policy.policy_network.actor.check_for_split(step_loss)
                policy.policy_network.critic.check_for_split(step_loss)

            # If we're training a trunk network, check for frequency of conflicting
            # gradients.
            if policy.policy_network.architecture_type == "trunk":
                if policy.policy_network.actor.monitor_grads:
                    policy.policy_network.actor.check_conflicting_grads(
                        step_loss)
                if policy.policy_network.critic.monitor_grads:
                    policy.policy_network.critic.check_conflicting_grads(
                        step_loss)

            # If we are multi-task training, consolidate task-losses with weighted sum.
            if num_tasks > 1:
                step_loss = torch.sum(step_loss)

            # Perform backward pass, clip gradient, and take optimizer step.
            policy.policy_network.zero_grad()
            step_loss.backward()
            if config["max_grad_norm"] is not None:
                nn.utils.clip_grad_norm_(policy.policy_network.parameters(),
                                         config["max_grad_norm"])
            policy.optimizer.step()
        policy.after_step()

        # Reset rollout storage.
        rollout.reset()

        # Aggregate metrics and run evaluation, if necessary.
        step_metrics = {}
        step_metrics["train_reward"] = episode_rewards
        step_metrics["train_success"] = episode_successes
        if (update_iteration % config["evaluation_freq"] == 0
                or update_iteration == config["num_updates"] - 1):
            # Reset environment and rollout, so we don't cross-contaminate episodes from
            # training and evaluation.
            rollout.init_rollout_info()
            rollout.set_initial_obs(env.reset())

            # Run evaluation and record metrics.
            policy.train = False
            evaluation_rewards, evaluation_successes = evaluate(
                env,
                policy,
                rollout,
                config["evaluation_episodes"],
            )
            policy.train = True
            step_metrics["eval_reward"] = evaluation_rewards
            step_metrics["eval_success"] = evaluation_successes

            # Reset environment and rollout, as above.
            rollout.init_rollout_info()
            rollout.set_initial_obs(env.reset())

        # Update and print metrics.
        metrics.update(step_metrics)
        if (update_iteration % config["print_freq"] == 0
                or update_iteration == config["num_updates"] - 1):
            message = "Update %d | " % update_iteration
            message += str(metrics)
            message += "\t"
            print(message, end="\r")

        # This is to ensure that printed out values don't get overwritten after we
        # finish.
        if update_iteration == config["num_updates"] - 1:
            print("")

        # Save intermediate training progress, if necessary. Note that we save an
        # incremented version of update_iteration so that the loaded version will take
        # on the subsequent value of update_iteration on the first step.
        if config["save_name"] is not None and (
                update_iteration == config["num_updates"] - 1 or
            (config["save_freq"] is not None
             and update_iteration % config["save_freq"] == 0)):
            checkpoint = {}
            checkpoint["policy"] = policy
            checkpoint["metrics"] = metrics
            checkpoint["update_iteration"] = update_iteration + 1
            checkpoint["config"] = config

            checkpoint_filename = os.path.join(save_dir, "checkpoint.pkl")
            with open(checkpoint_filename, "wb") as checkpoint_file:
                pickle.dump(checkpoint, checkpoint_file)

        update_iteration += 1

    # Close environment.
    env.close()

    # Save metrics if necessary.
    if config["metrics_filename"] is not None:
        if not os.path.isdir(METRICS_DIR):
            os.makedirs(METRICS_DIR)
        metrics_path = os.path.join(METRICS_DIR,
                                    "%s.pkl" % config["metrics_filename"])
        with open(metrics_path, "wb") as metrics_file:
            pickle.dump(metrics.history(), metrics_file)

    # Compare output_metrics to baseline if necessary.
    if config["baseline_metrics_filename"] is not None:
        baseline_metrics_path = os.path.join(
            METRICS_DIR, "%s.pkl" % config["baseline_metrics_filename"])
        compare_metrics(metrics.history(), baseline_metrics_path)

    # Save results if necessary.
    if config["save_name"] is not None:

        # Save metrics.
        metrics_path = os.path.join(save_dir,
                                    "%s_metrics.json" % config["save_name"])
        with open(metrics_path, "w") as metrics_file:
            json.dump(metrics.state(), metrics_file, indent=4)

        # Plot results.
        plot_path = os.path.join(save_dir, "%s_plot.png" % config["save_name"])
        plot(metrics.state(), plot_path)

    # Construct checkpoint.
    checkpoint = {}
    checkpoint["policy"] = policy
    checkpoint["metrics"] = metrics
    checkpoint["update_iteration"] = update_iteration + 1
    checkpoint["config"] = config

    return checkpoint
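For reference, pulling the documented keys together: a minimal config for train() might look like the sketch below. The values are illustrative placeholders rather than the repository's defaults, the architecture_config keyword arguments are omitted, and the last four keys (time_limit, normalize_first_n, evaluation_freq, evaluation_episodes) are read by the function body even though they are not listed in the docstring above.

# Illustrative train() config; values are placeholders, not the repository's defaults.
example_train_config = {
    "env_name": "CartPole-v1",
    "num_updates": 100,
    "rollout_length": 128,
    "num_ppo_epochs": 4,
    "num_minibatch": 4,
    "num_processes": 1,
    "lr_schedule_type": None,
    "initial_lr": 3e-4,
    "final_lr": 3e-5,
    "eps": 1e-5,
    "value_loss_coeff": 0.5,
    "entropy_loss_coeff": 0.01,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "max_grad_norm": 0.5,
    "clip_param": 0.2,
    "clip_value_loss": True,
    "normalize_advantages": True,
    "normalize_transition": True,
    "architecture_config": {"type": "vanilla"},  # plus VanillaNetwork keyword arguments
    "cuda": False,
    "seed": 0,
    "print_freq": 10,
    "save_freq": None,
    "load_from": None,
    "metrics_filename": None,
    "baseline_metrics_filename": None,
    "save_name": None,
    "same_np_seed": False,
    # Read by the function body but not documented in the docstring above.
    "time_limit": None,
    "normalize_first_n": None,
    "evaluation_freq": 10,
    "evaluation_episodes": 5,
}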
Example 7
def tune(tune_config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform search over hyperparameter configurations. The only argument is
    ``tune_config``, a dictionary holding settings for the search. The expected elements
    of this dictionary
    are documented below. This function returns a dictionary holding the results of
    training and the various parameter configurations used.

    Parameters
    ----------
    search_type : str
        Either "random", "grid", or "IC_grid", defines the search strategy to use.
    search_iterations : int
        Number of different hyperparameter configurations to try in search sequence. In
        cases where the number of configurations is determined by ``search_params``
        (such as when using grid search), the value of this variable is ignored, and the
        determined value is used instead.
    early_stop : Dict[str, int]
        Options to stop before reaching the end of training. This is mainly for
        simulating interruptions in tests. Should have two keys, "iterations" and
        "trials", whose values denote how many of each to execute before stopping
        early. For example, {"iterations": 3, "trials": 1} will execute 3 whole
        iterations and 1 trial of the 4th iteration. If early stopping isn't desired,
        this value can just be set to None.
    trials_per_config : int
        Number of training runs to perform for each hyperparameter configuration. The
        fitness of each training run is averaged to produce an overall fitness for each
        hyperparameter configuration.
    base_train_config : Dict[str, Any]
        Config dictionary for function train() in meta/train.py. This is used as a
        starting point for hyperparameter search. It is required that each leaf element
        of this config dictionary have a unique key, i.e. a config containing
        base_train_config["key1"]["num_layers"] and
        base_train_config["key2"]["num_layers"] is invalid. This occurrence will cause
        unexpected behavior due to the implementation of update_config().
    search_params : Dict[str, Any]
        Search specifications for each parameter, such as max/min values, etc. The
        format of this dictionary varies between different search types.
    fitness_metric_name : str
        Name of metric (key in metrics dictionary returned from train()) to use as
        fitness function for hyperparameter search. Currently supported values are
        "train_reward", "eval_reward", "train_success", "eval_success".
    fitness_metric_type : str
        Either "mean" or "maximum", used to determine which value of metric given in
        tune_config["fitnesss_metric_name"] to use as fitness, either the mean value at
        the end of training or the maximum value throughout training.
    seed : int
        Random seed for hyperparameter search.
    load_from : str
        Name of results directory to resume training from.
    """

    # Extract info from config.
    search_type = tune_config["search_type"]
    iterations = tune_config["search_iterations"]
    early_stop = tune_config["early_stop"]
    trials_per_config = tune_config["trials_per_config"]
    base_config = tune_config["base_train_config"]
    search_params = tune_config["search_params"]
    fitness_metric_name = tune_config["fitness_metric_name"]
    fitness_metric_type = tune_config["fitness_metric_type"]
    seed = tune_config["seed"]
    load_from = tune_config["load_from"]

    # Compute iterations from tune_config["search_params"] if necessary. When search
    # type is "grid" or "IC_grid", iterations must be computed from ``search_params``.
    if search_type in ["grid", "IC_grid"]:
        iterations = get_iterations(search_type, iterations, search_params)

    # Load checkpoint, if necessary.
    if load_from is not None:
        load_dir = save_dir_from_name(load_from)

        checkpoint_filename = os.path.join(load_dir, "checkpoint.pkl")
        with open(checkpoint_filename, "rb") as checkpoint_file:
            checkpoint = pickle.load(checkpoint_file)

        # Make sure current config and previous config line up.
        assert aligned_tune_configs(tune_config, checkpoint["tune_config"])

    else:
        load_dir = None
        checkpoint = None

    # Read in base name and make sure it is valid. Naming is slightly different for
    # different search strategies, so we do some weirdness here to make one function
    # which handles all cases. If it is valid, we make the save directory and save the
    # initial config.
    base_name = base_config["save_name"]
    if base_name is not None:

        # Compute previous checkpoint.
        start_pos = get_start_pos(search_type, checkpoint)

        # Edge case: If ``load_from == base_name``, then we exempt ``base_name`` from
        # the uniqueness check.
        exempt_base = load_from is not None and load_from == base_name

        # Check uniqueness of each training name.
        check_args = [
            base_name,
            search_type,
            iterations,
            trials_per_config,
            start_pos,
            exempt_base,
        ]
        if search_type == "IC_grid":
            num_param_values = get_num_param_values(search_params)
            check_args.append(num_param_values)
        check_name_uniqueness(*check_args)

        # Create save directory, if we aren't loading from an already existing directory
        # of the same name.
        save_dir = save_dir_from_name(base_name)
        if not exempt_base:
            os.makedirs(save_dir)

        # Save config.
        config_path = os.path.join(save_dir, "%s_config.json" % base_name)
        with open(config_path, "w") as config_file:
            json.dump(tune_config, config_file, indent=4)

    else:
        save_dir = None

    # Construct fitness function.
    if fitness_metric_name not in [
        "train_reward",
        "eval_reward",
        "train_success",
        "eval_success",
    ]:
        raise ValueError("Unsupported metric name: '%s'." % fitness_metric_name)
    if fitness_metric_type == "mean":
        fitness_fn = lambda metrics: metrics[fitness_metric_name]["mean"][-1]
    elif fitness_metric_type == "maximum":
        fitness_fn = lambda metrics: metrics[fitness_metric_name]["maximum"]
    else:
        raise ValueError("Unsupported metric type: '%s'." % fitness_metric_type)

    # Set random seed. Note that this may cause reproducibility issues since the train()
    # function now uses the random module.
    random.seed(seed)

    # Run the chosen search strategy.
    if tune_config["search_type"] == "random":
        search_fn = random_search
    elif tune_config["search_type"] == "grid":
        search_fn = grid_search
    elif tune_config["search_type"] == "IC_grid":
        search_fn = IC_grid_search
    results = search_fn(
        tune_config,
        base_config,
        iterations,
        early_stop,
        trials_per_config,
        fitness_fn,
        search_params,
        save_dir,
        checkpoint,
    )

    # Save results and config.
    if base_name is not None:

        # Save results.
        results_path = os.path.join(save_dir, "%s_results.json" % base_name)
        with open(results_path, "w") as results_file:
            json.dump(results, results_file, indent=4)

    return results
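Similarly, a skeleton tune_config with the keys that tune() reads. The values are illustrative, and search_params is left as a placeholder since its format varies by search type.

# Skeleton tune() config; values are illustrative placeholders.
example_tune_config = {
    "search_type": "random",
    "search_iterations": 10,
    "early_stop": None,
    "trials_per_config": 3,
    "base_train_config": example_train_config,  # e.g. the sketch after train() above
    "search_params": {},  # format depends on search_type; see the docstring
    "fitness_metric_name": "eval_reward",
    "fitness_metric_type": "mean",
    "seed": 0,
    "load_from": None,
}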