Example #1
import warnings

from fireworks import Firework, Workflow

# Package-local helpers assumed importable (exact module paths may differ;
# get_time_str's path is confirmed by Example #2's __main__ block):
# from automatminer_dev.workflows.util import get_time_str, get_last_commit
# from automatminer_dev.config import VALID_FWORKERS
# from automatminer_dev.tasks import RunSingleFit


def wf_single_fit(fworker, fit_name, pipe_config, name, df, target, tags=None):
    """
    Submit a dataset to be fit by a single pipeline (i.e., train on a full
    dataset for production predictions) and return the FireWorks workflow.
    """

    # TODO: this is probably not working -- note that `df` is never placed
    # into the spec, so the fitting task may not receive the dataset.
    warnings.warn("Single fitted MatPipe not being stored in automatminer db "
                  "collections. Please consult fw_spec to find the benchmark "
                  "on {}".format(fworker))
    if fworker not in VALID_FWORKERS:
        raise ValueError("fworker must be in {}".format(VALID_FWORKERS))

    # No data file is attached for single fits; see the TODO above.
    data_file = None

    now = get_time_str()
    base_save_dir = now + "_single_fit"

    spec = {
        "pipe_config": pipe_config,
        "base_save_dir": base_save_dir,
        "data_file": data_file,
        "target": target,
        "automatminer_commit": get_last_commit(),
        "tags": tags if tags else [],
        "_fworker": fworker,
    }

    fw_name = "{} single fit".format(name)
    wf_name = "single fit: {} ({}) [{}]".format(name, fit_name, fworker)

    fw = Firework(RunSingleFit(), spec=spec, name=fw_name)
    wf = Workflow([fw], metadata={"tags": tags}, name=wf_name)
    return wf
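
A hedged usage sketch (not from the source): building and submitting this workflow through a FireWorks LaunchPad. The "local" worker name, the pipe_config layout (modeled on the keys in Example #3's commented-out cori block), and the target column are assumptions about the caller's setup:

from fireworks import LaunchPad
from matminer.datasets import load_dataset

df = load_dataset("matbench_jdft2d")
pipe_config = {"learner_kwargs": {"n_jobs": 8},
               "autofeaturizer_kwargs": {"n_jobs": 8}}  # assumed layout

# "local" must appear in VALID_FWORKERS for this call to succeed.
wf = wf_single_fit("local", "jdft2d fit", pipe_config, "jdft2d",
                   df, "exfoliation_en", tags=["example"])

lp = LaunchPad.auto_load()  # reads the standard my_launchpad.yaml
lp.add_wf(wf)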
Example #2
        else:
            raise ValueError(f"Worker {worker} not valid!")

        # Parse the password: decode stdout, drop the first empty line, and
        # take the last remaining line.
        o_utf = o.decode("utf-8")
        o_all = o_utf.split("\n")
        o_all.remove("")  # removes only the first "" occurrence
        password = o_all[-1]

        # Open an SSH session with the one-time password (ignoring local
        # keys) and copy the file to the remote home directory via SCP.
        ssh = SSHClient()
        ssh.load_system_host_keys()
        ssh.connect(host,
                    username=user,
                    password=password,
                    look_for_keys=False)

        with SCPClient(ssh.get_transport()) as scp:
            scp.put(filepath,
                    recursive=True,
                    remote_path="/global/home/users/ardunn")
    else:
        # Nothing to transfer for other workers.
        pass


if __name__ == "__main__":
    from matminer.datasets import load_dataset
    from automatminer_dev.workflows.util import get_time_str

    df = load_dataset("matbench_jdft2d")
    transfer_data(df, "lrc", get_time_str())
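
Example #2 begins mid-function, so the opening of transfer_data is not shown. Below is a minimal sketch of what that opening could look like, inferred only from the names the excerpt binds before its first "else:" (filepath, host, user, o); the get_otp command, the login host, and the pickle path are hypothetical placeholders, not the project's actual values:

import subprocess

from paramiko import SSHClient
from scp import SCPClient


def transfer_data(df, worker, dataset_name):
    # Serialize the dataframe locally before copying it to the worker.
    filepath = "/tmp/{}.pickle".format(dataset_name)  # assumed location
    df.to_pickle(filepath)

    if worker in ("lrc",):  # hypothetical guard matching the trailing "else: pass"
        if worker == "lrc":
            host = "lrc-login.lbl.gov"  # hypothetical login host
            user = "ardunn"             # matches the remote_path in the excerpt
            # Assumed: a local command prints a one-time password as the
            # last non-empty line of its stdout, matching the parsing above.
            o = subprocess.check_output(["get_otp"])  # hypothetical command
        else:
            raise ValueError(f"Worker {worker} not valid!")
        # ... continues with the password parsing and SCP transfer shown
        # in the excerpt ...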
Example #3
import copy
import hashlib

from fireworks import Firework, Workflow
from sklearn.model_selection import KFold

# Package-local helpers assumed importable (exact module paths may differ):
# from automatminer_dev.workflows.util import get_time_str, get_last_commit
# from automatminer_dev.config import VALID_FWORKERS, KFOLD_DEFAULT
# from automatminer_dev.tasks import (RunPipe, StorePipeResults,
#                                     ConsolidatePipesToBenchmark)


def wf_benchmark(
    fworker,
    pipe_config,
    name,
    data_file,
    target,
    problem_type,
    clf_pos_label,
    cache=True,
    kfold_config=KFOLD_DEFAULT,
    tags=None,
    return_fireworks=False,
    add_dataset_to_names=True,
    build_id=None,
    prepend_name="",
):
    """
    Build a FireWorks workflow benchmarking a pipeline configuration: one
    firework per k-fold split, plus a final consolidation firework.
    """
    if fworker not in VALID_FWORKERS:
        raise ValueError("fworker must be in {}".format(VALID_FWORKERS))

    # if fworker == "cori":
    #     n_cori_jobs = 32
    #     warnings.warn(
    #         "Worker is cori. Overriding n_jobs to {}".format(n_cori_jobs))
    #     pipe_config["learner_kwargs"]["n_jobs"] = n_cori_jobs
    #     pipe_config["autofeaturizer_kwargs"]["n_jobs"] = n_cori_jobs

    # The benchmark (run) hash is the combination of pipe configuration,
    # last commit, data_file, and worker.
    last_commit = get_last_commit()
    benchmark_config_for_hash = copy.deepcopy(pipe_config)
    benchmark_config_for_hash["last_commit"] = last_commit
    benchmark_config_for_hash["data_file"] = data_file
    benchmark_config_for_hash["worker"] = fworker
    benchmark_config_for_hash = str(benchmark_config_for_hash).encode("UTF-8")
    benchmark_hash = hashlib.sha1(benchmark_config_for_hash).hexdigest()[:10]
    base_save_dir = get_time_str() + "_" + benchmark_hash

    common_spec = {
        "pipe_config": pipe_config,
        "base_save_dir": base_save_dir,
        "kfold_config": kfold_config,
        "data_file": data_file,
        "target": target,
        "clf_pos_label": clf_pos_label,
        "problem_type": problem_type,
        "automatminer_commit": last_commit,
        "name": name,
        "benchmark_hash": benchmark_hash,
        "tags": tags if tags else [],
        "cache": cache,
        "build_id": build_id,
        "_fworker": fworker,
    }

    dataset_name = "" if not add_dataset_to_names else name + " "

    fws_all_folds = []
    kfold = KFold(**kfold_config)
    for fold in range(kfold.n_splits):
        save_dir = "fold_{}".format(fold)
        foldspec = copy.deepcopy(common_spec)
        foldspec["fold"] = fold
        foldspec["save_dir"] = save_dir

        if fold == 0 and cache:
            pipename = "{}fold {} + featurization ({})".format(
                dataset_name, fold, benchmark_hash)
        else:
            pipename = "{}fold {} ({})".format(dataset_name, fold,
                                               benchmark_hash)

        fws_all_folds.append(
            Firework([RunPipe(), StorePipeResults()],
                     spec=foldspec,
                     name=pipename))

    fw_consolidate = Firework(
        ConsolidatePipesToBenchmark(),
        spec=common_spec,
        name="bench merge ({})".format(benchmark_hash),
    )

    if cache:
        # Fold 0 runs first (featurization is cached there); the remaining
        # folds depend on it, and every fold feeds the consolidation firework.
        fw_fold0 = fws_all_folds[0]
        fws_folds = fws_all_folds[1:]
        links = {fw: [fw_consolidate] for fw in fws_folds}
        links[fw_fold0] = fws_folds
        links[fw_consolidate] = []
    else:
        # Without caching, all folds run independently and feed consolidation;
        # there is no dedicated fold-0 firework, so keep the whole list.
        links = {fw: [fw_consolidate] for fw in fws_all_folds}
        links[fw_consolidate] = []
        fw_fold0 = fws_all_folds

    if return_fireworks:
        # Return the raw links and fireworks so callers can embed this
        # benchmark inside a larger workflow.
        connected_to_top_wf = [fw_fold0] if cache else fw_fold0
        return links, connected_to_top_wf, fw_consolidate
    else:
        wf_name = "benchmark {}: ({}) [{}]".format(benchmark_hash, name,
                                                   fworker)
        if prepend_name:
            wf_name = "<<{}>> {}".format(prepend_name, wf_name)

        wf = Workflow(
            list(links.keys()),
            links_dict=links,
            name=wf_name,
            metadata={
                "benchmark_hash": benchmark_hash,
                "tags": tags
            },
        )
        return wf
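
As with Example #1, a hedged submission sketch for wf_benchmark; the worker name, data_file path, problem_type string, and kfold_config values are illustrative assumptions, not values from the source:

from fireworks import LaunchPad

pipe_config = {"learner_kwargs": {"n_jobs": 8},
               "autofeaturizer_kwargs": {"n_jobs": 8}}  # assumed layout

wf = wf_benchmark(
    fworker="local",                  # assumed to be in VALID_FWORKERS
    pipe_config=pipe_config,
    name="jdft2d",
    data_file="jdft2d.pickle",        # hypothetical serialized dataset
    target="exfoliation_en",
    problem_type="regression",
    clf_pos_label=None,               # only needed for classification
    kfold_config={"n_splits": 5, "shuffle": True, "random_state": 18012019},
    tags=["example"],
)

lp = LaunchPad.auto_load()
lp.add_wf(wf)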