def pipeline(num_shards=500, max_shard=3) -> co.Serial:
    root = co.Serial()

    # Download raw data
    root["Download"] = download_node(DATA_ROOT, num_shards, max_shard)

    # Compute covariance matrices. Use co.Lazy to generate tree.
    # (map) Compute covs in parallel, one for each tfrecord file (implemented, need tree)
    root["Compute covariance matrices"] = co.Lazy(
        compute_covs_node,
        in_glob=f"{DATA_ROOT}/train*.tfrecord",
        out_dir=COVS_ROOT)

    # (reduce) Merge covariance matrices, using a 2-level reduce step:
    # N -> sqrt(N) -> 1 (implemented, need tree)
    root["Merge covariance matrices"] = co.Lazy(
        merge_covs_node,
        in_dir=COVS_ROOT,
        tmp_dir=MERGED_TMP,
        out_file=MERGED_FILE,
    )

    # Fit an OLS model using the covariance matrices (implemented, need tree)
    root["Models"] = co.Parallel()
    for ridge in [0, 1, 10, 100, 500]:
        name = "Linear" if ridge == 0 else f"Ridge={ridge}"
        model_node = co.Serial()
        model_node["Fit"] = co.Exec(
            commands.fit,
            in_path=MERGED_FILE,
            out_path=f"{MODEL_DIR}/{name}.pkl.gzip",
            ridge=ridge,
        )

        # Run a backtest on the validation data for each model (need to implement)
        model_node["Backtest"] = co.Lazy(
            backtest_node,
            model_path=f"{MODEL_DIR}/{name}.pkl.gzip",
            in_glob=f"{DATA_ROOT}/validate*.tfrecord",
            out_dir=f"{BACKTEST_ROOT}/{name}")

        model_node["Merge backtests"] = co.Exec(
            commands.merge_backtest,
            in_paths=[f"{BACKTEST_ROOT}/{name}/validate*.pkl.gzip"],
            out_path=f"{BACKTEST_ROOT}/{name}/summary.pkl.gzip")

        root["Models"][name] = model_node

    root["Summarize"] = co.Exec(
        commands.summarize,
        in_paths=[f"{BACKTEST_ROOT}/*/summary.pkl.gzip"])

    return root
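
# The map-step generator passed to co.Lazy (compute_covs_node) is not shown
# above. A minimal sketch of what such a generator might look like: it runs
# at pipeline runtime, after "Download", so it can glob the files that
# actually exist. The helper commands.compute_cov and its signature are
# assumptions, not the author's code.
import glob
import os

import conducto as co


def compute_covs_node(in_glob, out_dir) -> co.Parallel:
    # One co.Exec per tfrecord file; Conducto runs the children in parallel.
    node = co.Parallel()
    for path in sorted(glob.glob(in_glob)):
        name = os.path.basename(path)
        node[name] = co.Exec(
            commands.compute_cov,  # hypothetical per-file helper
            in_path=path,
            out_path=f"{out_dir}/{name}.pkl.gzip",
        )
    return node
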
def main(start_date="20120101") -> co.Serial:
    """
    Build a volume-prediction model for SPY.US. Steps:
    * Download data from S3 to the /conducto/data drive.
    * Compute features in parallel.
    * Build 3 models in parallel to predict volume.
    * For each model, fit, then do a parallel backtest.
    * Once all backtests are complete, summarize the results.
    """
    path = "/conducto/data/pipeline"
    root = co.Serial(image=_get_image(), env={"PYTHONBREAKPOINT": "ipdb.set_trace"})
    root["Download"] = co.Exec(download_data, f"{path}/raw")

    # "Compute Features" should be parallelized at runtime, based on the actual
    # data downloaded in the previous step. Use co.Lazy to define and execute
    # this subtree.
    root["Compute Features"] = co.Lazy(
        make_compute_features_node,
        in_dir=f"{path}/raw",
        tmp_dir=f"{path}/feat/tmp",
        out_file=f"{path}/feat/merged.csv",
        start_date=start_date,
    )

    # Try three different model types
    root["Models"] = co.Parallel()
    for mdl in ["linear", "svm", "gradient_boost"]:
        # For each model, fit it, then backtest
        root["Models"][mdl] = fit_and_test = co.Serial()
        fit_and_test["Fit"] = co.Exec(
            fit,
            model_type=mdl,
            in_file=f"{path}/feat/merged.csv",
            out_file=f"{path}/fit/{mdl}",
        )
        fit_and_test["Backtest"] = co.Lazy(
            make_backtest_node,
            feature_dir=f"{path}/feat",
            model_file=f"{path}/fit/{mdl}",
            tmp_dir=f"{path}/results/tmp/{mdl}",
            out_file=f"{path}/results/{mdl}.csv",
        )

    # Analyze the results of the backtests and plot.
    root["Analyze"] = co.Exec(analyze, f"{path}/results")
    return root
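
# A pipeline definition like this is typically launched through Conducto's
# entrypoint. A minimal sketch, assuming main() lives in a file named
# pipeline.py (the invocations below are illustrative):
import conducto as co

if __name__ == "__main__":
    co.main(default=main)

# Then, for example:
#   python pipeline.py --local                    # run on this machine
#   python pipeline.py main --start_date=20130101 # override the default arg
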
def run() -> co.Serial:
    cfg = configparser.ConfigParser()
    cfg.read('config.ini')

    # read config params (reps)
    reps = cfg['params']['replicates']
    print(f'running with {reps} replicates')

    image = co.Image(image="gbly/miniconda3", copy_dir=".",
                     reqs_py=['conducto==0.0.67'])
    with co.Serial(image=image, doc=co.util.magic_doc()) as pipeline:
        # pipeline["python_trial"] = co.Exec("python -c 'import pandas as pd'")
        pipeline["parallel_experiment"] = co.Lazy(parallelize_reps, reps=int(reps))
        pipeline["plot_data"] = co.Exec(plot_reps, reps=int(reps))
    return pipeline
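
# The snippet above expects a config.ini with a [params] section next to the
# pipeline file. A minimal example (the value 10 is purely illustrative):
#
#   [params]
#   replicates = 10
#
# Note that configparser returns strings, which is why the code casts with
# int(reps) before handing the value to the pipeline nodes.
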
def main() -> co.Parallel:
    """
    Dynamically build pipelines for each actor in a static list.
    """
    actors = ["Oprah Winfrey", "Kate Mara", "Don Cheadle", "Dwayne Johnson"]
    root = co.Parallel(image=_get_image())
    for actor in actors:
        root[actor] = co.Lazy(
            f"python pipeline.py all_by_actor '{actor}'"
        )
    return root
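
# Here co.Lazy is given a shell command rather than a Python callable: at
# runtime Conducto runs "python pipeline.py all_by_actor '<name>'" and uses
# the node tree that function returns as the subtree for that actor. A sketch
# of what all_by_actor might look like -- get_movies and analyze_movie are
# hypothetical helpers, not part of the original:
def all_by_actor(actor) -> co.Parallel:
    node = co.Parallel()
    for movie in get_movies(actor):  # e.g. query a film API at runtime
        node[movie] = co.Exec(f"python pipeline.py analyze_movie '{movie}'")
    return node
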
def pipeline() -> co.Serial:
    # Defer node definition until the first node runs
    root = co.Lazy(nodes_for_this_month)

    # Conducto installs the dependencies into its image
    root.image = co.Image(
        copy_url="https://github.com/MatrixManAtYrService/sandboxen",
        copy_branch="master",
        path_map={".": "./fortune_witherror"},
        reqs_py=["conducto", "sh"],
        reqs_packages=["fortune"],
    )
    return root
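
# nodes_for_this_month is deferred via co.Lazy, so the tree can depend on the
# calendar at the moment the pipeline runs. A minimal sketch, assuming one
# fortune-printing node per day of the current month (the node names and
# command are illustrative):
import calendar
import datetime

import conducto as co


def nodes_for_this_month() -> co.Parallel:
    today = datetime.date.today()
    _, days = calendar.monthrange(today.year, today.month)
    node = co.Parallel()
    for day in range(1, days + 1):
        node[f"day_{day:02d}"] = co.Exec("fortune")
    return node
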
def run() -> co.Serial:
    run.__doc__ = __doc__
    with co.Serial(image=utils.IMG, doc=co.util.magic_doc()) as output:
        output["gen_data"] = n = co.Exec(gen_data, WORDLIST_PATH, count=50000)
        n.doc = co.util.magic_doc(func=gen_data)

        output["parallel_word_count"] = n = co.Lazy(
            parallelize, WORDLIST_PATH, RESULT_DIR, top=15, chunksize=1000)
        n.doc = co.util.magic_doc(func=parallelize)
        n["Generate"].doc = None

        output["summarize"] = n = co.Exec(summarize, RESULT_DIR, top=15)
        n.doc = co.util.magic_doc(func=summarize)
    return output
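
# parallelize is the co.Lazy generator: it runs after gen_data, so it can
# size the fan-out to the word list that was actually produced. A sketch
# under assumptions (count_words and merge_counts are hypothetical helpers,
# and the chunking scheme is a guess):
def parallelize(path, result_dir, top, chunksize) -> co.Serial:
    with open(path) as f:
        n_words = sum(1 for _ in f)  # one word per line
    node = co.Serial()
    node["count"] = counts = co.Parallel()
    for start in range(0, n_words, chunksize):
        counts[f"words_{start}"] = co.Exec(
            count_words, path, result_dir, start=start, size=chunksize)
    node["merge"] = co.Exec(merge_counts, result_dir, top=top)
    return node
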
def main() -> co.Serial:
    with co.Serial(image=get_image(), doc=ROOT_DOC) as output:
        output["Intro"] = co.Exec("figlet '_Conducto_ for Data Science'")
        with co.Parallel(name="LoadData", doc=LOAD_DOC) as load:
            load["Customer"] = co.Exec(PUT_CUSTOMER_DATA_CMD)
            load["Transaction"] = co.Exec(PUT_TRANSACTION_DATA_CMD)
        output["Join"] = co.Lazy(join_customer_transaction_data)
        output["Join"].doc = JOIN_DOC
        output["ComputeFeatures"] = co.Exec(COMPUTE_CMD, doc=COMPUTE_DOC)
        with co.Parallel(name="Models", doc=MODELS_DOC):
            for md in ["logistic", "random_forest", "gradient_boost"]:
                with co.Serial(name=md) as fit_and_test:
                    fit_and_test["Fit"] = co.Exec(FIT_CMD.format(md=md))
                    fit_and_test["Backtest"] = co.Exec(
                        BACKTEST_CMD.format(md=md))
        output["Analyze"] = co.Exec(ANALYZE_CMD, doc=ANALYZE_DOC)
    return output
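
# The *_CMD constants are shell command strings defined elsewhere in the
# module, each with a {md} placeholder filled per model. Purely illustrative
# guesses at their shape -- the script names and flags are invented:
FIT_CMD = "python models.py fit --model={md} --features=/conducto/data/features.csv"
BACKTEST_CMD = "python models.py backtest --model={md} --out=/conducto/data/results/{md}.csv"
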
def hello_self() -> co.Serial:
    pipeline = co.Serial(image=examples_img, env={"PYTHONPATH": "."})
    pipeline["Say Hi"] = co.Lazy(get_pipeline)
    return pipeline
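
# get_pipeline is resolved lazily inside the container (hence the PYTHONPATH
# env so it can be imported there); even a trivial generator works. A minimal
# sketch -- the echo command is illustrative:
def get_pipeline() -> co.Serial:
    node = co.Serial()
    node["hi"] = co.Exec("echo 'Hi, I was generated at runtime'")
    return node
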
def make_pipeline() -> co.Serial:
    root = co.Serial(image=img)
    root['fortune'] = co.Lazy(nodes_for_this_month)
    return root