Example #1
def pipeline(num_shards=500, max_shard=3) -> co.Serial:
    root = co.Serial()
    # Download raw data
    root["Download"] = download_node(DATA_ROOT, num_shards, max_shard)

    # Compute covariance matrices. Use co.Lazy to generate tree
    #   (map) Compute covs in parallel, one for each tfrecord file (implemented, need tree)
    root["Compute covariance matrices"] = co.Lazy(
        compute_covs_node,
        in_glob=f"{DATA_ROOT}/train*.tfrecord",
        out_dir=COVS_ROOT)

    #   (reduce) Merge covariance matrices, using a 2-level reduce step: N->sqrt(N)->1 (implemented, need tree)
    root["Merge covariance matrices"] = co.Lazy(
        merge_covs_node,
        in_dir=COVS_ROOT,
        tmp_dir=MERGED_TMP,
        out_file=MERGED_FILE,
    )

    # Fit an OLS model using the covariance matrices (implemented, need tree)
    root["Models"] = co.Parallel()

    for ridge in [0, 1, 10, 100, 500]:
        name = "Linear" if ridge == 0 else f"Ridge={ridge}"
        model_node = co.Serial()

        model_node["Fit"] = co.Exec(
            commands.fit,
            in_path=MERGED_FILE,
            out_path=f"{MODEL_DIR}/{name}.pkl.gzip",
            ridge=ridge,
        )
        # Run a backtest on the validation data for each model (need to implement)
        model_node["Backtest"] = co.Lazy(
            backtest_node,
            model_path=f"{MODEL_DIR}/{name}.pkl.gzip",
            in_glob=f"{DATA_ROOT}/validate*.tfrecord",
            out_dir=f"{BACKTEST_ROOT}/{name}")
        model_node["Merge backtests"] = co.Exec(
            commands.merge_backtest,
            in_paths=[f"{BACKTEST_ROOT}/{name}/validate*.pkl.gzip"],
            out_path=f"{BACKTEST_ROOT}/{name}/summary.pkl.gzip")

        root["Models"][name] = model_node

    root["Summarize"] = co.Exec(
        commands.summarize, in_paths=[f"{BACKTEST_ROOT}/*/summary.pkl.gzip"])
    return root
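The "need to implement" comment above refers to the co.Lazy generator. A minimal sketch of backtest_node, under the assumption that a commands.backtest worker exists with the same calling convention as commands.fit (the worker and its arguments are not from the source):

import glob
import os

def backtest_node(model_path, in_glob, out_dir) -> co.Parallel:
    # Called at runtime by co.Lazy, after "Fit" has written model_path, so the
    # validation shards can be enumerated and backtested in parallel.
    node = co.Parallel()
    for in_path in sorted(glob.glob(in_glob)):
        shard = os.path.basename(in_path)
        node[shard] = co.Exec(
            commands.backtest,  # hypothetical worker, assumed to exist
            model_path=model_path,
            in_path=in_path,
            out_path=f"{out_dir}/{shard}.pkl.gzip",
        )
    return node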
Example #2
def main(start_date="20120101") -> co.Serial:
    """
    Build a volume-prediction model for SPY.US. Steps:
    * Download data from S3 to the /conducto/data drive.
    * Compute features in parallel.
    * Build 3 models in parallel to predict volume.
    * For each model, fit, then do a parallel backtest.
    * Once all backtests are complete, summarize the results.
    """
    path = "/conducto/data/pipeline"

    root = co.Serial(image=_get_image(),
                     env={"PYTHONBREAKPOINT": "ipdb.set_trace"})
    root["Download"] = co.Exec(download_data, f"{path}/raw")

    # "Compute Features" should be parallelized at runtime, based on the actual
    # data downloaded in the previous step. Use co.Lazy to define and execute
    # this subtree.
    root["Compute Features"] = co.Lazy(
        make_compute_features_node,
        in_dir=f"{path}/raw",
        tmp_dir=f"{path}/feat/tmp",
        out_file=f"{path}/feat/merged.csv",
        start_date=start_date,
    )
    # Try three different model types
    root["Models"] = co.Parallel()
    for mdl in ["linear", "svm", "gradient_boost"]:
        # For each model, fit it, then backtest
        root["Models"][mdl] = fit_and_test = co.Serial()
        fit_and_test["Fit"] = co.Exec(
            fit,
            model_type=mdl,
            in_file=f"{path}/feat/merged.csv",
            out_file=f"{path}/fit/{mdl}",
        )
        fit_and_test["Backtest"] = co.Lazy(
            make_backtest_node,
            feature_dir=f"{path}/feat",
            model_file=f"{path}/fit/{mdl}",
            tmp_dir=f"{path}/results/tmp/{mdl}",
            out_file=f"{path}/results/{mdl}.csv",
        )

    # Analyze the results of the backtests and plot.
    root["Analyze"] = co.Exec(analyze, f"{path}/results")
    return root
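make_compute_features_node is not shown in the source; a sketch of its likely shape, assuming hypothetical compute_features and merge_features workers. Because co.Lazy calls it only after "Download" finishes, it can shard over the files that actually arrived:

import glob
import os

def make_compute_features_node(in_dir, tmp_dir, out_file, start_date) -> co.Serial:
    node = co.Serial()
    node["Map"] = mapper = co.Parallel()
    for in_path in sorted(glob.glob(f"{in_dir}/*")):
        name = os.path.basename(in_path)
        mapper[name] = co.Exec(
            compute_features,  # hypothetical per-file worker
            in_file=in_path,
            out_file=f"{tmp_dir}/{name}.csv",
            start_date=start_date,
        )
    # hypothetical reduce step producing the merged feature file
    node["Merge"] = co.Exec(merge_features, in_dir=tmp_dir, out_file=out_file)
    return node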
Example #3
def run() -> co.Serial:
    cfg = configparser.ConfigParser()
    cfg.read('config.ini')  # read config params (number of replicates)
    reps = int(cfg['params']['replicates'])
    print(f'running with {reps} replicates')
    image = co.Image(image="gbly/miniconda3", copy_dir=".", reqs_py=['conducto==0.0.67'])
    with co.Serial(image=image, doc=co.util.magic_doc()) as pipeline:
        pipeline["parallel_experiment"] = co.Lazy(parallelize_reps, reps=reps)
        pipeline["plot_data"] = co.Exec(plot_reps, reps=reps)
    return pipeline
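parallelize_reps is the co.Lazy generator here; a minimal sketch, assuming a hypothetical run_replicate worker:

def parallelize_reps(reps: int) -> co.Parallel:
    # Invoked at runtime; the returned tree is grafted into the pipeline,
    # one branch per replicate.
    node = co.Parallel()
    for i in range(reps):
        node[f"rep_{i}"] = co.Exec(run_replicate, rep_id=i)  # hypothetical worker
    return node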
Example #4
def main() -> co.Parallel:
    """
    Dynamically build pipelines for each actor in a static list.
    """
    actors = ["Oprah Winfrey", "Kate Mara", "Don Cheadle", "Dwayne Johnson"]
    root = co.Parallel(image=_get_image())
    for actor in actors:
        root[actor] = co.Lazy(
            f"python pipeline.py all_by_actor '{actor}'"
        )
    return root
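This example hands co.Lazy a shell command rather than a Python callable; the command is expected to emit the subtree for Conducto to graft in. An assumed counterpart in the same pipeline.py (both child commands are illustrative only):

def all_by_actor(actor: str) -> co.Serial:
    node = co.Serial()
    node["fetch"] = co.Exec(f"python pipeline.py fetch_filmography '{actor}'")
    node["summarize"] = co.Exec(f"python pipeline.py summarize '{actor}'")
    return node

if __name__ == "__main__":
    co.main(default=main)  # makes all_by_actor callable from the command line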
Example #5
def pipeline() -> co.Serial:
    # defer node definition until the first node runs
    root = co.Lazy(nodes_for_this_month)

    # Conducto installs the dependencies into its image
    root.image = co.Image(
        copy_url="https://github.com/MatrixManAtYrService/sandboxen",
        copy_branch="master",
        path_map={".": "./fortune_witherror"},
        reqs_py=["conducto", "sh"],
        reqs_packages=["fortune"],
    )

    return root
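nodes_for_this_month lives in the copied repo, so its body is not shown; a plausible sketch, assuming it emits one fortune run per day of the current month:

import calendar
import datetime

def nodes_for_this_month() -> co.Parallel:
    # Because the definition is deferred, "this month" is resolved when the
    # pipeline runs, not when it is defined.
    today = datetime.date.today()
    _, num_days = calendar.monthrange(today.year, today.month)
    node = co.Parallel()
    for day in range(1, num_days + 1):
        node[f"{today.year:04d}-{today.month:02d}-{day:02d}"] = co.Exec("fortune")
    return node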
Example #6
def run() -> co.Serial:
    run.__doc__ = __doc__
    with co.Serial(image=utils.IMG, doc=co.util.magic_doc()) as output:
        output["gen_data"] = n = co.Exec(gen_data, WORDLIST_PATH, count=50000)
        n.doc = co.util.magic_doc(func=gen_data)

        output["parallel_word_count"] = n = co.Lazy(parallelize,
                                                    WORDLIST_PATH,
                                                    RESULT_DIR,
                                                    top=15,
                                                    chunksize=1000)
        n.doc = co.util.magic_doc(func=parallelize)
        n["Generate"].doc = None

        output["summarize"] = n = co.Exec(summarize, RESULT_DIR, top=15)
        n.doc = co.util.magic_doc(func=summarize)

    return output
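A sketch of the parallelize generator, assuming a hypothetical count_words worker and a word list split into chunksize-line chunks; merging the per-chunk counts is left to the separate "summarize" node:

def parallelize(path, result_dir, top, chunksize) -> co.Parallel:
    # Runs as the Lazy node's "Generate" step: inspect the input, then emit
    # one counting task per chunk.
    with open(path) as f:
        num_words = sum(1 for _ in f)
    node = co.Parallel()
    for start in range(0, num_words, chunksize):
        node[f"chunk_{start}"] = co.Exec(
            count_words,  # hypothetical per-chunk worker
            path, result_dir, start=start, size=chunksize, top=top)
    return node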
Example #7
def main() -> co.Serial:
    with co.Serial(image=get_image(), doc=ROOT_DOC) as output:
        output["Intro"] = co.Exec(f"figlet '_Conducto_ for Data Science'")

        with co.Parallel(name="LoadData", doc=LOAD_DOC) as load:
            load["Customer"] = co.Exec(PUT_CUSTOMER_DATA_CMD)
            load["Transaction"] = co.Exec(PUT_TRANSACTION_DATA_CMD)

        output["Join"] = co.Lazy(join_customer_transaction_data)
        output["Join"].doc = JOIN_DOC

        output["ComputeFeatures"] = co.Exec(COMPUTE_CMD, doc=COMPUTE_DOC)

        with co.Parallel(name="Models", doc=MODELS_DOC):
            for md in ["logistic", "random_forest", "gradient_boost"]:
                with co.Serial(name=md) as fit_and_test:
                    fit_and_test["Fit"] = co.Exec(FIT_CMD.format(md=md))
                    fit_and_test["Backtest"] = co.Exec(
                        BACKTEST_CMD.format(md=md))

        output["Analyze"] = co.Exec(ANALYZE_CMD, doc=ANALYZE_DOC)

    return output
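join_customer_transaction_data is the one undefined co.Lazy target here. A guess at its shape; the data path and JOIN_CMD are assumptions, not from the source:

import glob
import os

def join_customer_transaction_data() -> co.Parallel:
    # Deferring to runtime lets the generator shard the join over whatever
    # customer files LoadData actually produced.
    node = co.Parallel()
    for path in sorted(glob.glob("/conducto/data/pipeline/customer/*")):  # assumed layout
        shard = os.path.basename(path)
        node[shard] = co.Exec(JOIN_CMD.format(shard=shard))  # hypothetical command
    return node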
Example #8
def hello_self() -> co.Serial:
    pipeline = co.Serial(image=examples_img, env={"PYTHONPATH": "."})
    pipeline["Say Hi"] = co.Lazy(get_pipeline)
    return pipeline
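Even a trivial subtree can be deferred: any callable that returns a node works as a co.Lazy target. A minimal assumed get_pipeline (the echoed text is an assumption):

def get_pipeline() -> co.Serial:
    node = co.Serial()
    node["hello"] = co.Exec("echo 'Hi!'")
    return node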
Example #9
def make_pipeline() -> co.Serial:
    root = co.Serial(image=img)
    root["fortune"] = co.Lazy(nodes_for_this_month)
    return root