Example #1
def main():
    """ This program's entrypoint. """
    utils.set_rand_seed()

    psr = argparse.ArgumentParser(
        description=(
            "Merges parsed experiment files into unified training, validation, "
            "and test data."))
    psr.add_argument(
        "--data-dir",
        help="The path to a directory containing the experiment files.",
        required=True, type=str)
    psr.add_argument(
        "--train-split", default=50, help="Training data fraction",
        required=False, type=float)
    psr.add_argument(
        "--val-split", default=20, help="Validation data fraction",
        required=False, type=float)
    psr.add_argument(
        "--test-split", default=30, help="Test data fraction",
        required=False, type=float)
    psr, psr_verify = cl_args.add_sample_percent(*cl_args.add_out(
        *cl_args.add_warmup(*cl_args.add_num_exps(psr))))
    args = psr_verify(psr.parse_args())

    split_fracs = {
        "train": args.train_split / 100, "val": args.val_split / 100,
        "test": args.test_split / 100}
    tot_split = sum(split_fracs.values())
    # Compare with a tolerance rather than "==" to avoid spurious
    # floating-point failures for some split values.
    assert abs(tot_split - 1) < 1e-6, \
        ("The sum of the training, validation, and test splits must equal 100, "
         f"not {tot_split * 100}")

    tim_srt_s = time.time()
    # Determine the experiment filepaths.
    exps_dir = args.data_dir
    exp_flps = [
        path.join(exps_dir, fln) for fln in os.listdir(exps_dir)
        if not fln.startswith(defaults.DATA_PREFIX) and fln.endswith(".npz")]
    random.shuffle(exp_flps)
    num_exps = len(exp_flps) if args.num_exps is None else args.num_exps
    exp_flps = exp_flps[:num_exps]
    print(f"Selected {num_exps} experiments")
    warmup_frac = args.warmup_percent / 100
    sample_frac = args.sample_percent / 100
    exp_flps, num_pkts, dtype = survey(exp_flps, warmup_frac)
    print(
        f"Total packets: {num_pkts}\nFeatures ({len(dtype.names)}):\n\t" +
        "\n\t".join(sorted(dtype.names)))

    # Create the merged training, validation, and test files.
    merge(
        exp_flps, args.out_dir, num_pkts, dtype, split_fracs, warmup_frac,
        sample_frac)
    print(f"Finished - time: {time.time() - tim_srt_s:.2f} seconds")
    return 0
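
A note on the cl_args helpers used above: they are project-specific and not shown in these examples. As a rough, hypothetical sketch of the add_*()/verify pattern they appear to follow (each helper attaches arguments to the parser and returns it along with a post-parse validation callback), using a made-up add_out() helper:

import argparse

def add_out(psr):
    # Hypothetical helper mirroring the cl_args.add_*() pattern above:
    # attach an argument, then return the parser and a verifier callback.
    psr.add_argument(
        "--out-dir", default=".", required=False, type=str,
        help="The directory in which to store output files.")

    def verify(args):
        # Post-parse validation hook, applied to the parsed namespace.
        assert args.out_dir, "--out-dir must not be empty."
        return args

    return psr, verify

psr = argparse.ArgumentParser(description="Demo of the add_*()/verify pattern.")
psr, psr_verify = add_out(psr)
args = psr_verify(psr.parse_args(["--out-dir", "/tmp/out"]))
print(args.out_dir)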
Example #2
def main():
    """ This program's entrypoint. """
    # Parse command line arguments.
    psr = argparse.ArgumentParser(
        description="Parses the output of gen_training_data.py.")
    psr.add_argument(
        "--exp-dir",
        help=("The directory in which the experiment results are stored "
              "(required)."),
        required=True,
        type=str)
    psr.add_argument("--random-order",
                     action="store_true",
                     help="Parse the simulations in a random order.")
    psr, psr_verify = cl_args.add_out(psr)
    args = psr_verify(psr.parse_args())
    exp_dir = args.exp_dir
    out_dir = args.out_dir

    # Find all simulations.
    pcaps = [(path.join(exp_dir, sim), out_dir)
             for sim in sorted(os.listdir(exp_dir))]
    if args.random_order:
        # Set the random seed so that multiple instances of this
        # script see the same random order.
        utils.set_rand_seed()
        random.shuffle(pcaps)

    print(f"Num files: {len(pcaps)}")
    tim_srt_s = time.time()
    if defaults.SYNC:
        for pcap in pcaps:
            parse_pcap(*pcap)
    else:
        with multiprocessing.Pool() as pol:
            pol.starmap(parse_pcap, pcaps)
    print(f"Done parsing - time: {time.time() - tim_srt_s:.2f} seconds")
Example #3
def main():
    """ This program's entrypoint. """
    # Parse command line arguments.
    psr = argparse.ArgumentParser(description="Evaluates feature correlation.")
    psr, psr_verify = cl_args.add_training(psr)
    args = psr_verify(psr.parse_args())

    # Train models.
    # all_fets = sorted(models.MODELS[args.model].in_spc)
    all_fets = ALL_FETS

    # x-axis features.
    fets_x = list(reversed(all_fets))
    # y-axis features.
    fets_y = all_fets

    out_dir = args.out_dir
    dat_flp = path.join(out_dir, "correlation.npz")
    if path.exists(dat_flp):
        # Load existing results.
        print(f"Found existing data: {dat_flp}")
        with np.load(dat_flp) as fil:
            accs_ratios = fil[fil.files[0]]
    else:
        # Create the list of simulations here instead of letting the
        # training script do it so that all runs use the same
        # simulations.
        dat_dir = args.data_dir
        sims = [path.join(dat_dir, sim) for sim in os.listdir(dat_dir)]
        if train.SHUFFLE:
            # Set the random seed so that multiple instances of this
            # script see the same random order.
            utils.set_rand_seed()
            random.shuffle(sims)
        num_sims = args.num_sims
        if num_sims is not None:
            num_sims_actual = len(sims)
            assert num_sims_actual >= num_sims, \
                (f"Insufficient simulations. Requested {num_sims}, but only "
                 f"{num_sims_actual} available.")
            sims = sims[:num_sims]

        # Train models.
        accs_single = run_cnfs([[fet] for fet in all_fets], args, sims)
        accs_pairs = run_cnfs(
            [
                [fet1, fet2] for i, fet1 in enumerate(all_fets)
                for j, fet2 in enumerate(all_fets)
                # Do not consider pairs of the same feature.
                if i != j
            ],
            args,
            sims)
        # Calculate the accuracy ratios.
        accs_ratios = np.array([[
            (accs_pairs[(fet1, fet2)] / accs_single[(fet1, )] if
             (fet1, fet2) in accs_pairs else 0) for fet2 in fets_x
        ] for fet1 in fets_y])
        # Save results.
        np.savez_compressed(dat_flp, accs_ratios=accs_ratios)
        print(f"Saving results: {dat_flp}")

    # Graph results.
    plt.subplots(figsize=(8, 7))
    with sns.axes_style("white"):
        sns.heatmap(accs_ratios,
                    linewidth=0.5,
                    center=1,
                    xticklabels=fets_x,
                    yticklabels=fets_y,
                    square=True,
                    annot=True,
                    fmt=".2f",
                    annot_kws={"fontsize": 8})
    plt.tight_layout()
    out_flp = path.join(out_dir, "correlation.pdf")
    print(f"Saving graph: {out_flp}")
    plt.savefig(out_flp)
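
The heatmap call above is standard seaborn. A minimal sketch of the same graphing step, using random ratios in place of the output of run_cnfs(), for previewing the figure styling without training any models:

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

fets = ["fet_a", "fet_b", "fet_c"]
# Random stand-in for accs_ratios: pairwise accuracy divided by
# single-feature accuracy, centered around 1.
ratios = np.random.uniform(0.8, 1.2, size=(len(fets), len(fets)))

plt.subplots(figsize=(4, 3.5))
with sns.axes_style("white"):
    sns.heatmap(
        ratios, linewidth=0.5, center=1, xticklabels=fets, yticklabels=fets,
        square=True, annot=True, fmt=".2f", annot_kws={"fontsize": 8})
plt.tight_layout()
plt.savefig("correlation_demo.pdf")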
Example #4
def run_trials(args):
    """
    Runs args["conf_trials"] trials and survives args["max_attempts"] failed
    attempts.
    """
    print(f"Arguments: {args}")

    if args["no_rand"]:
        utils.set_rand_seed()
    # Prepare the output directory.
    out_dir = args["out_dir"]
    if not path.isdir(out_dir):
        print(f"Output directory does not exist. Creating it: {out_dir}")
        os.makedirs(out_dir)
    # Create a temporary model to use during the data preparation
    # process. Another model will be created for the actual training.
    net_tmp = models.MODELS[args["model"]]()
    # Verify that the necessary supplemental parameters are present.
    for param in net_tmp.params:
        assert param in args, f"\"{param}\" not in args: {args}"
    # Assemble the output filepath.
    out_flp = path.join(
        args["out_dir"],
        defaults.MODEL_PREFIX + utils.args_to_str(
            args, order=sorted(defaults.DEFAULTS.keys()), which="model") + (
                # Determine the proper extension based on the type of model.
                ".pickle"
                if isinstance(net_tmp, models.SvmSklearnWrapper) else ".pth"))
    # If custom features are specified, then overwrite the model's
    # default features.
    fets = args["features"]
    if fets:
        net_tmp.in_spc = tuple(fets)
    else:
        args["features"] = net_tmp.in_spc

    # Load the training, validation, and test data.
    ldrs = data.get_dataloaders(args, net_tmp)

    # TODO: Parallelize attempts.
    trls = args["conf_trials"]
    apts = 0
    apts_max = args["max_attempts"]
    ress = []
    while trls > 0 and apts < apts_max:
        apts += 1
        res = (run_sklearn if isinstance(net_tmp, models.SvmSklearnWrapper)
               else run_torch)(args, out_dir, out_flp, ldrs)
        if res[0] == 100:
            print((
                f"Training failed (attempt {apts}/{apts_max}). Trying again!"))
        else:
            ress.append(res)
            trls -= 1
    if ress:
        print(("Resulting accuracies: "
               f"{', '.join([f'{acc:.2f}' for acc, _ in ress])}"))
        max_acc, tim_s = max(ress, key=lambda p: p[0])
        print(f"Maximum accuracy: {max_acc:.2f}")
        # Return the minimum error instead of the maximum accuracy.
        return 1 - max_acc, tim_s
    print(f"Model cannot be trained with args: {args}")
    return float("NaN"), float("NaN")
Example #5
def main():
    """ This program's entrypoint. """
    utils.set_rand_seed()

    psr = argparse.ArgumentParser(description=(
        "Merges parsed simulation files into unified training, validation, "
        "and test data."))
    psr.add_argument(
        "--data-dir",
        help="The path to a directory containing the simulation files.",
        required=True,
        type=str)
    psr.add_argument("--train-split",
                     default=50,
                     help="Training data fraction",
                     required=False,
                     type=float)
    psr.add_argument("--val-split",
                     default=20,
                     help="Validation data fraction",
                     required=False,
                     type=float)
    psr.add_argument("--test-split",
                     default=30,
                     help="Test data fraction",
                     required=False,
                     type=float)
    psr, psr_verify = cl_args.add_out(*cl_args.add_warmup(
        *cl_args.add_num_sims(psr)))
    args = psr_verify(psr.parse_args())

    split_prcs = {
        "train": args.train_split,
        "val": args.val_split,
        "test": args.test_split
    }
    tot_split = sum(split_prcs.values())
    assert tot_split == 100, \
        ("The sum of the training, validation, and test splits must equal 100, "
         f"not {tot_split}")

    tim_srt_s = time.time()
    # Determine the simulation filepaths.
    sims_dir = args.data_dir
    sim_flns = os.listdir(sims_dir)
    random.shuffle(sim_flns)
    num_sims = args.num_sims
    num_sims = len(sim_flns) if num_sims is None else num_sims
    print(f"Selected {num_sims} simulations")
    sim_flps = [
        path.join(sims_dir, sim_fln) for sim_fln in sim_flns[:num_sims]
    ]
    warmup_frac = args.warmup_percent / 100
    num_pkts, dtype = survey(sim_flps, warmup_frac)
    fets = dtype.names
    print(f"Total packets: {num_pkts}\nFeatures:\n    " +
          "\n    ".join(sorted(fets)))

    # Create the merged training, validation, and test files.
    merge(sim_flps, args.out_dir, num_pkts, dtype, split_prcs, warmup_frac)
    print(f"Finished - time: {time.time() - tim_srt_s:.2f} seconds")
    return 0
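
survey() and merge() are defined elsewhere in the project and are not shown in these examples. As a rough, hypothetical sketch of what a survey-style pass over the .npz files could look like (assuming each file stores a single structured array, as the np.load() usage in Example #3 suggests):

import numpy as np

def survey_npz(flps):
    # Hypothetical stand-in for survey(): count rows across the given .npz
    # files and return the shared dtype. Assumes each file holds exactly one
    # structured array.
    num_rows = 0
    dtype = None
    for flp in flps:
        with np.load(flp) as fil:
            arr = fil[fil.files[0]]
        num_rows += arr.shape[0]
        if dtype is None:
            dtype = arr.dtype
        else:
            assert arr.dtype == dtype, f"dtype mismatch in {flp}"
    return num_rows, dtype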
Example #6
def run_trials(args):
    """
    Run args["conf_trials"] trials and survive args["max_attempts"] failed
    attempts.
    """
    print(f"Arguments: {args}")

    if args["no_rand"]:
        utils.set_rand_seed()

    out_dir = args["out_dir"]
    if not path.isdir(out_dir):
        print(f"Output directory does not exist. Creating it: {out_dir}")
        os.makedirs(out_dir)
    net_tmp = models.MODELS[args["model"]]()
    # Verify that the necessary supplemental parameters are present.
    for param in net_tmp.params:
        assert param in args, f"\"{param}\" not in args: {args}"
    # Assemble the output filepath.
    out_flp = path.join(
        args["out_dir"],
        (utils.args_to_str(args, order=sorted(defaults.DEFAULTS.keys()))) + (
            # Determine the proper extension based on the type of
            # model.
            ".pickle"
            if isinstance(net_tmp, models.SvmSklearnWrapper) else ".pth"))
    # If custom features are specified, then overwrite the model's
    # default features.
    fets = args["features"]
    if fets:
        net_tmp.in_spc = fets
    else:
        assert "arrival time us" not in args["features"]
        args["features"] = net_tmp.in_spc
    # If a trained model file already exists, then delete it.
    if path.exists(out_flp):
        os.remove(out_flp)

    # Load or generate training data.
    dat_flp = path.join(out_dir, "data.npz")
    scl_prms_flp = path.join(out_dir, "scale_params.json")
    # Check for the presence of both the data and the scaling
    # parameters because the resulting model is useless without the
    # proper scaling parameters.
    if (not args["regen_data"] and path.exists(dat_flp)
            and path.exists(scl_prms_flp)):
        print("Found existing data!")
        dat_in, dat_out, dat_out_raw, dat_out_oracle, num_flws = utils.load(
            dat_flp)
        dat_in_shape = dat_in.shape
        dat_out_shape = dat_out.shape
        assert dat_in_shape[0] == dat_out_shape[0], \
            f"Data has invalid shapes! in: {dat_in_shape}, out: {dat_out_shape}"
    else:
        print("Regenerating data...")
        dat_in, dat_out, dat_out_raw, dat_out_oracle, num_flws = (gen_data(
            net_tmp, args, dat_flp, scl_prms_flp))
    print(f"Number of input features: {len(dat_in.dtype.names)}")

    # Visualize the ground truth data.
    utils.visualize_classes(net_tmp, dat_out)

    # TODO: Parallelize attempts.
    trls = args["conf_trials"]
    apts = 0
    apts_max = args["max_attempts"]
    ress = []
    while trls > 0 and apts < apts_max:
        apts += 1
        res = (run_sklearn if isinstance(net_tmp, models.SvmSklearnWrapper)
               else run_torch)(args, dat_in, dat_out, dat_out_raw,
                               dat_out_oracle, num_flws, out_dir, out_flp)
        if res[0] == 100:
            print((
                f"Training failed (attempt {apts}/{apts_max}). Trying again!"))
        else:
            ress.append(res)
            trls -= 1
    if ress:
        print(("Resulting accuracies: "
               f"{', '.join([f'{acc:.2f}' for acc, _ in ress])}"))
        max_acc, tim_s = max(ress, key=lambda p: p[0])
        print(f"Maximum accuracy: {max_acc:.2f}")
        # Return the minimum error instead of the maximum accuracy.
        return 1 - max_acc, tim_s
    print(f"Model cannot be trained with args: {args}")
    return float("NaN"), float("NaN")
Example #7
def make_datasets(net, args, dat=None):
    """
    Parses the simulation files in data_dir and transforms them (e.g., by
    scaling) into the correct format for the network.

    If num_sims is not None, then this function selects the first num_sims
    simulations only. If SHUFFLE is False, then the simulations are parsed in
    sorted order. Use num_sims with SHUFFLE=False to simplify debugging.
    """
    if dat is None:
        # Find simulations.
        sims = args["sims"]
        if not sims:
            dat_dir = args["data_dir"]
            sims = [
                path.join(dat_dir, sim) for sim in sorted(os.listdir(dat_dir))
            ]
        if SHUFFLE:
            # Set the random seed so that multiple parallel instances of
            # this script see the same random order.
            utils.set_rand_seed()
            random.shuffle(sims)
        num_sims = args["num_sims"]
        if num_sims is not None:
            num_sims_actual = len(sims)
            assert num_sims_actual >= num_sims, \
                (f"Insufficient simulations. Requested {num_sims}, but only "
                 f"{num_sims_actual} available.")
            sims = sims[:num_sims]
        tot_sims = len(sims)
        print(f"Found {tot_sims} simulations.")

        # Prepare the temporary output directory. The output of parsing each
        # simulation is written to disk instead of being transferred
        # between processes because sometimes the data is too large for
        # Python to send between processes.
        tmp_dir = args["tmp_dir"]
        if tmp_dir is None:
            tmp_dir = args["out_dir"]
        if not path.isdir(tmp_dir):
            print(
                f"Temporary directory does not exist. Creating it: {tmp_dir}")
            os.makedirs(tmp_dir)

        # Parse simulations.
        sims_args = [(idx, tot_sims, net, sim, tmp_dir, args["warmup_percent"],
                      args["keep_percent"]) for idx, sim in enumerate(sims)]
        if defaults.SYNC or args["sync"]:
            dat_all = [process_sim(*sim_args) for sim_args in sims_args]
        else:
            with multiprocessing.Pool() as pol:
                # Each element of dat_all corresponds to a single simulation.
                dat_all = pol.starmap(process_sim, sims_args)
        # Throw away results from simulations that could not be parsed.
        dat_all = [dat for dat in dat_all if dat is not None]
        print(f"Discarded {tot_sims - len(dat_all)} simulations!")
        assert dat_all, "No valid simulations found!"
        dat_all, sims = zip(*dat_all)

        dat_all = [utils.load_tmp_file(flp) for flp in dat_all]
    else:
        dat_all, sims = dat

    # Validate data.
    dim_in = None
    dtype_in = None
    dim_out = None
    dtype_out = None
    scl_grps = None
    for dat_in, dat_out, _, _, scl_grps_cur in dat_all:
        dim_in_cur = len(dat_in.dtype.names)
        dim_out_cur = len(dat_out.dtype.names)
        dtype_in_cur = dat_in.dtype
        dtype_out_cur = dat_out.dtype
        if dim_in is None:
            dim_in = dim_in_cur
        if dim_out is None:
            dim_out = dim_out_cur
        if dtype_in is None:
            dtype_in = dtype_in_cur
        if dtype_out is None:
            dtype_out = dtype_out_cur
        if scl_grps is None:
            scl_grps = scl_grps_cur
        assert dim_in_cur == dim_in, \
            f"Invalid input feature dim: {dim_in_cur} != {dim_in}"
        assert dim_out_cur == dim_out, \
            f"Invalid output feature dim: {dim_out_cur} != {dim_out}"
        assert dtype_in_cur == dtype_in, \
            f"Invalud input dtype: {dtype_in_cur} != {dtype_in}"
        assert dtype_out_cur == dtype_out, \
            f"Invalid output dtype: {dtype_out_cur} != {dtype_out}"
        assert (scl_grps_cur == scl_grps).all(), \
            f"Invalid scaling groups: {scl_grps_cur} != {scl_grps}"
    assert dim_in is not None, "Unable to compute input feature dim!"
    assert dim_out is not None, "Unable to compute output feature dim!"
    assert dtype_in is not None, "Unable to compute input dtype!"
    assert dtype_out is not None, "Unable to compute output dtype!"
    assert scl_grps is not None, "Unable to compute scaling groups!"

    # Build combined feature lists.
    dat_in_all, dat_out_all, dat_out_all_raw, dat_out_all_oracle, _ = zip(
        *dat_all)
    # Determine the number of flows in each example.
    num_flws = [sim.unfair_flws + sim.fair_flws for sim in sims]
    num_flws = [
        np.array([num_flws_] * dat_in.shape[0], dtype=[("num_flws", "int")])
        for num_flws_, dat_in in zip(num_flws, dat_in_all)
    ]
    num_flws = np.concatenate(num_flws, axis=0)
    # Stack the arrays.
    dat_in_all = np.concatenate(dat_in_all, axis=0)
    dat_out_all = np.concatenate(dat_out_all, axis=0)
    dat_out_all_raw = np.concatenate(dat_out_all_raw, axis=0)
    dat_out_all_oracle = np.concatenate(dat_out_all_oracle, axis=0)

    # Convert all instances of -1 (feature value unknown) to the mean
    # for that feature.
    bad_fets = []
    for fet in dat_in_all.dtype.names:
        fet_values = dat_in_all[fet]
        if (fet_values == -1).all():
            bad_fets.append(fet)
            continue
        dat_in_all[fet] = np.where(fet_values == -1, np.mean(fet_values),
                                   fet_values)
        assert (dat_in_all[fet] != -1).all(), f"Found \"-1\" in feature: {fet}"
    assert not bad_fets, f"Features contain only \"-1\": {bad_fets}"

    # Scale input features. Do this here instead of in process_sim()
    # because all of the features must be scaled using the same
    # parameters.
    dat_in_all, prms_in = scale_fets(dat_in_all, scl_grps, args["standardize"])

    # # Check if any of the data is malformed and discard features if
    # # necessary.
    # fets = []
    # for fet in dat_in_all.dtype.names:
    #     fet_values = dat_in_all[fet]
    #     if ((not np.isnan(fet_values).any()) and
    #             (not np.isinf(fet_values).any())):
    #         fets.append(fet)
    #     else:
    #         print(f"Discarding: {fet}")
    # dat_in_all = dat_in_all[fets]

    return (dat_in_all, dat_out_all, dat_out_all_raw, dat_out_all_oracle,
            num_flws, prms_in)
def main():
    """ This program's entrypoint. """
    psr = argparse.ArgumentParser(
        description="Visualize sklearn training parameters.")
    psr, psr_verify = cl_args.add_training(psr)
    psr.add_argument(
        "--graph-results", action="store_true",
        help=("Look through the output directory for completed experiments, "
              "and graph them."))
    args = psr_verify(psr.parse_args())
    args = train.prepare_args(vars(args))
    tim_srt_s = time.time()
    out_dir = args["out_dir"]

    # Assemble the configurations to test. Sort them based on the
    # product of their hyper-parameters, which is a heuristic of how
    # long they will take to run.
    cnfs = sorted(
        [train.prepare_args({
            "warmup_percent": args["warmup_percent"],
            "model": args["model"],
            "kernel": args["kernel"],
            "degree": args["degree"],
            "penalty": args["penalty"],
            "standardize": args["standardize"],
            "no_rand": args["no_rand"],
            "max_iter": max_iter,
            "keep_percent": prc,
            "num_sims": num_sims,
            "out_dir": path.join(out_dir, f"{num_sims}_{prc}_{max_iter}")
        }) for num_sims, prc, max_iter in set(
            # Fix number of iterations and number of
            # simulations. Vary percent of each simulation.
            list(itertools.product(
                NUMS_SIMS, range(PRC_MIN, PRC_MAX + 1, PRC_DELTA),
                NUMS_ITERS)) +
            # Fix percent of each simulation and number of
            # simulations. Vary number of iterations.
            list(itertools.product(
                NUMS_SIMS, KEEP_PRCS,
                range(NUM_ITERS_MIN, NUM_ITERS_MAX + 1, NUM_ITERS_DELTA))))],
        key=lambda cnf: np.prod(
            [cnf["num_sims"], cnf["keep_percent"], cnf["max_iter"]]))
    print(f"Will test {len(cnfs)} configurations.")

    if args["graph_results"]:
        graph_partial_results(cnfs, out_dir)
        return

    # For each possible configuration of number of simulations and
    # percentage of each simulation, create a temporary file
    # containing the parsed data for that configuration.
    all_prcs = list(set(
        KEEP_PRCS + list(range(PRC_MIN, PRC_MAX + 1, PRC_DELTA))))
    tmp_dat = {}
    for num_sims, prc in itertools.product(NUMS_SIMS, all_prcs):
        base_dir = path.join(out_dir, f"{num_sims}_{prc}")
        if not path.exists(base_dir):
            os.makedirs(base_dir)
        tmp_dat_flp = path.join(base_dir, "data.npz")
        tmp_scl_prms_flp = path.join(base_dir, "scale_params.json")
        # Record the paths to the data and scale parameters files.
        tmp_dat[(num_sims, prc)] = (tmp_dat_flp, tmp_scl_prms_flp)

    # Look up the location of the data corresponding to each
    # configuration and make a mapping from this temporary data to
    # where it should be copied in the configuration's output
    # directory (where it will be read automatically).
    src_dst = []
    for cnf in cnfs:
        cnf_out_dir = cnf["out_dir"]
        if not path.exists(cnf_out_dir):
            os.makedirs(cnf_out_dir)
        tmp_dat_flp, tmp_scl_prms_flp = tmp_dat[
            (cnf["num_sims"], cnf["keep_percent"])]
        src_dst.append((
            (tmp_dat_flp,
             path.join(cnf_out_dir, path.basename(tmp_dat_flp))),
            (tmp_scl_prms_flp,
             path.join(cnf_out_dir, path.basename(tmp_scl_prms_flp)))))

    # Check whether any of the data has not been generated yet. If any
    # of it is missing, then we must regenerate all of the data.
    if np.array([
            path.exists(dat_dst) and path.exists(scl_prms_dst)
            for (_, dat_dst), (_, scl_prms_dst) in src_dst]).all():
        print("All data already generated.")
    else:
        print("Generating all new data.")
        # The data processing here is tricky. Our goal is to have
        # the configurations all process the same data to make the
        # comparison as "apples-to-apples" as possible. To that
        # end, all of the configurations will process subsets of
        # the same set of simulations. The high-level strategy is
        # to manually load and process the data and then place it
        # strategically so that the training process reads it in
        # automatically. The process is as follows:
        #    1) Determine the greatest number of simulations that any
        #       configuration will require and parse them.
        #    2) For each configuration, pick a subset of these simulations.
        #       When picking which simulations to use, always pick from the
        #       front. This means that configurations with more simulations
        #       will process the same simulations as those with fewer
        #       simulations, plus some extra. When picking which datapoints
        #       to use from within a simulation, choose them randomly.
        #    3) For each configuration, copy its data into its output
        #       directory so that it will be automatically detected and read
        #       in.

        # Load the required number of simulations.
        dat_dir = args["data_dir"]
        sims = [path.join(dat_dir, sim) for sim in os.listdir(dat_dir)]
        if train.SHUFFLE:
            # Set the random seed so that multiple instances of this
            # script see the same random order.
            utils.set_rand_seed()
            random.shuffle(sims)
        num_sims_actual = len(sims)
        max_sims = max(NUMS_SIMS)
        assert num_sims_actual >= max_sims, \
            (f"Insufficient simulations. Requested {max_sims}, but only "
             f"{num_sims_actual} available.")
        sims = sims[:max_sims]
        net = models.MODELS[args["model"]]()
        sim_args = [
            (idx, max_sims, net, sim_flp, out_dir, args["warmup_percent"], 100)
            for idx, sim_flp in enumerate(sims)]
        if defaults.SYNC:
            dat_all = [train.process_sim(*sim_args_) for sim_args_ in sim_args]
        else:
            with multiprocessing.Pool() as pol:
                dat_all = pol.starmap(train.process_sim, sim_args)
        # Verify that we were able to parse all
        # simulations. Normally, we would allow the training
        # process to proceed even if some simulations failed to
        # parse, but since in this case we are looking at specific
        # trends, we need to make sure that we are training on the
        # number of simulations that we intend.
        for dat in dat_all:
            assert dat is not None, \
                "Error processing at least one simulation. Check logs (above)."
        # Unpack the data.
        dat_all, sims = zip(*dat_all)
        dat_all = [utils.load_tmp_file(flp) for flp in dat_all]
        dat_in, dat_out, dat_out_raw, dat_out_oracle, scl_grps = zip(*dat_all)
        dat_in = list(dat_in)
        dat_out = list(dat_out)
        dat_out_raw = list(dat_out_raw)
        dat_out_oracle = list(dat_out_oracle)
        scl_grps = list(scl_grps)

        # Generate temporary data.
        for (num_sims, prc), (tmp_dat_flp, tmp_scl_prms_flp) in tmp_dat.items():
            # Select the data corresponding to this number of
            # simulations and percent of each simulation.
            dat_all = list(zip(*utils.filt(
                dat_in, dat_out, dat_out_raw, dat_out_oracle, scl_grps,
                num_sims, prc)))
            # Finish processing the data and save it in a form that
            # can be read by the training process.
            ignore = train.gen_data(
                net, args, dat_flp=tmp_dat_flp, scl_prms_flp=tmp_scl_prms_flp,
                dat=(dat_all, sims), save_data=True)
            del ignore

        # Copy temporary data to configuration output directories.
        for (dat_src, dat_dst), (scl_prms_src, scl_prms_dst) in src_dst:
            shutil.copyfile(dat_src, dat_dst)
            shutil.copyfile(scl_prms_src, scl_prms_dst)

        # Remove temporary data files.
        for num_sims, prc in itertools.product(NUMS_SIMS, all_prcs):
            tmp_dir = path.join(out_dir, f"{num_sims}_{prc}")
            print(f"Removing: {tmp_dir}")
            shutil.rmtree(tmp_dir)

    # Train models.
    train.run_cnfs(
        cnfs, defaults.SYNC, maybe_run_cnf, cleanup_combine_and_save_results)

    # # Remove real data files.
    # for cnf in cnfs:
    #     print(f"Removing: {cnf['out_dir']}")
    #     shutil.rmtree(cnf["out_dir"])

    graph_partial_results(cnfs, out_dir)
    print(f"Total time: {time.time() - tim_srt_s:.2f} seconds")
    return
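
One step in make_datasets() above that is easy to reuse on its own is the "-1 means unknown" imputation over a NumPy structured array. A minimal sketch with a toy array (note that, as in the code above, the mean is computed over all values, including the -1 placeholders):

import numpy as np

dat = np.array(
    [(1.0, -1.0), (3.0, 4.0), (5.0, 6.0)],
    dtype=[("rtt_us", "float64"), ("loss_rate", "float64")])
bad_fets = []
for fet in dat.dtype.names:
    vals = dat[fet]
    if (vals == -1).all():
        # No usable values for this feature; the real code records it and
        # fails an assertion afterwards.
        bad_fets.append(fet)
        continue
    dat[fet] = np.where(vals == -1, np.mean(vals), vals)
assert not bad_fets, f"Features contain only \"-1\": {bad_fets}"
print(dat)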