def data_pipeline() -> co.Serial:
    """
    `conducto-data-pipeline` is a pipeline-local key-value store. This data is
    only visible to your pipeline and persists until your pipeline is
    archived. One useful application is storing binaries in a build node, and
    retrieving them in a later test node. We exercise the `put` and `get`
    commands to do this.
    """
    # Compile the Go app, then stash the binary under a pipeline-local key.
    compile_and_put = """set -ex
go build -o bin/app ./app.go
conducto-data-pipeline put --name my_app_binary --file bin/app
"""
    # Retrieve the stashed binary in a later node and run its self-test.
    fetch_and_test = """set -ex
conducto-data-pipeline get --name my_app_binary --file /tmp/app
/tmp/app --test
"""
    # Dockerfile installs golang and conducto.
    img = co.Image(
        dockerfile="./docker/Dockerfile.data",
        context=".",
        copy_dir="./code",
    )
    with co.Serial(image=img, doc=co.util.magic_doc()) as build_and_test:
        co.Exec("conducto-data-pipeline --help", name="usage")
        co.Exec(compile_and_put, name="build")
        co.Exec(fetch_and_test, name="test")
    return build_and_test
def disambiguate() -> co.Parallel:
    """Show the ways kwargs can target the node versus the wrapped function."""
    with co.Parallel(image=co.Image(copy_dir=".")) as node:
        # No ambiguity here: every kwarg refers to conducto.Node.__init__.
        co.Exec('''echo "node has 1.5 cpu's"''', name="A", cpu=1.5)

        # Native method parameters come first. Build the node object, adjust
        # it in a second step, then connect it to its parent.
        child = co.Exec(myfunc, "DDR4-2933 (quad channel)", cpu=2950)
        child.set(cpu=0.75, mem=1.5)
        node["B"] = child

        # Or connect it to its parent first, then modify it in place.
        node["C"] = co.Exec(myfunc, "DDR4-2667 (dual channel)")
        node["C"].set(cpu=0.75, mem=1.5)

        # Some non-custom types don't have obvious string representations...
        payload = {"foo": 2, "bar": 3}
        func(payload)
        # ...so you may have to handle the serialization yourself.
        node["D"] = co.Exec(wrappedfunc, json.dumps(payload))

        # Custom types work, but you need to provide helpers.
        param_obj = Emoticon(happy=True)
        node["E"] = co.Exec(describe, param_obj)
    return node
def run() -> co.Serial:
    """Install requirements, then run the test suite against the repo."""
    img = co.Image(
        "python:3.7",
        copy_branch="master",
        copy_url="https://github.com/liamcryan/ieuler.git",
    )
    with co.Serial(image=img, doc=co.util.magic_doc()) as pipeline:
        co.Exec("pip install -r requirements.txt", name="build")
        co.Exec("pytest", name="tests")
    return pipeline
def get_image():
    """Return the base image: slim Python 3.8 plus wget and the analysis stack."""
    python_reqs = [
        "conducto",
        "numpy",
        "pandas",
        "scanpy",
        "ipdb",
        "leidenalg",
        "tabulate",
    ]
    return co.Image(
        "python:3.8-slim",
        copy_dir=".",
        reqs_packages=["wget"],
        reqs_py=python_reqs,
    )
def primes_less_than(n: int) -> co.Serial:
    """Build a one-node pipeline that sieves the primes below `n`."""
    with co.Serial(image=co.Image(copy_dir=".")) as root:
        # `sieve` is a native python function; conducto serializes the call.
        root["find primes"] = co.Exec(sieve, n)
    return root
def _get_image():
    """Return a slim Python 3.8 image carrying the ML/plotting dependencies."""
    deps = ["conducto", "boto3", "pandas", "sklearn", "matplotlib", "ipdb"]
    return co.Image("python:3.8-slim", copy_dir=".", reqs_py=deps)
def get_image():
    """Return a docker-capable image with flask/black plus curl and vim."""
    python_reqs = ["flask", "black"]
    system_reqs = ["curl", "vim"]
    return co.Image(
        "python:3.8-slim",
        copy_dir=".",
        reqs_py=python_reqs,
        reqs_packages=system_reqs,
        reqs_docker=True,
    )
def primes_less_than(n) -> co.Serial:
    """Sieve the primes below `n` in one container, then sanity-check them."""
    limit = int(n)
    with co.Serial(
        same_container=co.SameContainer.NEW, image=co.Image(copy_dir=".")
    ) as root:
        # The sieve writes its output to the `primes` file.
        root["find primes"] = co.Exec(f"python sieve.py {limit}")
        if limit >= 3:
            # 2 is the only prime below 3, so these checks need limit >= 3.
            root["check distribution"] = co.Exec(
                f"cat primes | python check.py {limit}"
            )
            root["is 2 included?"] = co.Exec("egrep '^2$' primes")
    return root
def run() -> co.Serial:
    """
    Read the replicate count from config.ini and build the experiment pipeline.

    Fixes from review: dropped a stray trailing semicolon, parse the
    replicate count once instead of calling int() at every use, and removed
    a commented-out dead node.
    """
    cfg = configparser.ConfigParser()
    cfg.read('config.ini')
    # work config params (reps)
    reps = int(cfg['params']['replicates'])
    print(f'running with {reps} replicates')
    image = co.Image(image="gbly/miniconda3", copy_dir=".",
                     reqs_py=['conducto==0.0.67'])
    with co.Serial(image=image, doc=co.util.magic_doc()) as pipeline:
        # Lazy: the parallel fan-out is generated when the node first runs.
        pipeline["parallel_experiment"] = co.Lazy(parallelize_reps, reps=reps)
        pipeline["plot_data"] = co.Exec(plot_reps, reps=reps)
    return pipeline
def build(projects: typing.List[str]) -> co.Parallel:
    "Build projects in parallel, using simple shell command."
    # Override the parent image to use one with docker installed.
    docker_img = co.Image(image="docker:19.03", copy_dir=".")
    node = co.Parallel(image=docker_img, requires_docker=True)
    for proj in projects:
        # Command needs docker; inherits flag from parent node.
        node[proj] = co.Exec(f"cd {proj} && docker build .")
    return node
def pr(branch) -> co.Parallel:
    """Inspect a branch: build an image from it, then poke at its files."""
    # Make a Docker image, based on python:alpine, with the whole repo and
    # the contents of the given branch.
    image = co.Image("python:alpine", copy_repo=True, copy_branch=branch)
    # Using that image, run three commands in parallel against the repo.
    root = co.Parallel(image=image)
    root["print branch"] = co.Exec(f"echo {branch}")
    root["print working directory"] = co.Exec("pwd")
    root["list files"] = co.Exec("ls -la")
    return root
def build_and_test() -> co.Serial:
    """Run the trading scripts and the top-K pipelines as two parallel pairs."""
    repo_image = co.Image(copy_dir="./code")
    # stop_on_error=False: run every stage even if an earlier one fails.
    with co.Serial(image=repo_image, stop_on_error=False) as pipeline:
        with co.Parallel(name="Trade") as trade:
            trade["US"] = co.Exec("python3 first_stock_trading.py")
            trade["CHINA"] = co.Exec("python3 second_stock_trading.py")
        with co.Parallel(name="TopK") as topk:
            topk["US"] = co.Exec("python3 first_topK_stock_pipeline.py")
            topk["CHINA"] = co.Exec("python3 second_topK_stock_pipeline.py")
    return pipeline
def download_and_plot() -> co.Serial:
    """Download the USDA rainfall archive, then render daily/monthly plots."""
    fetch_cmd = """
apt update -y && apt install -y curl unzip
curl https://www.fs.usda.gov/rds/archive/products/RDS-2005-0004/RDS-2005-0004.zip > data.zip
unzip data.zip
"""
    img = co.Image(dockerfile='./Dockerfile', context='.')
    with co.Serial(image=img) as pipeline:
        co.Exec(fetch_cmd, name="download")
        # The two plots are independent, so draw them in parallel.
        with co.Parallel(name='plot'):
            co.Exec('python rainfall.py', name='daily')
            co.Exec('python rainfall.py --resample M --save', name='monthly')
    return pipeline
def get(cls, image=None, dockerfile=None, reqs_py=None, name=None):
    """
    Build a co.Image from the class-level copy/context settings.

    Exactly one image source must be supplied: a base `image` name or a
    `dockerfile` path. Raises ValueError when neither is given — the
    original used `assert`, which is silently stripped under `python -O`.
    """
    if image is None and dockerfile is None:
        raise ValueError("get() requires either `image` or `dockerfile`")
    return co.Image(
        image=image,
        dockerfile=dockerfile,
        context=cls.context,
        reqs_py=reqs_py,
        copy_dir=cls.copy_dir,
        copy_url=cls.copy_url,
        copy_branch=cls.copy_branch,
        path_map=cls.path_map,
        name=name,
    )
def main() -> co.Serial:
    """Check service access, deploy, integration-test, then tear everything down."""
    img = co.Image(dockerfile="./Dockerfile", copy_dir=".")
    with co.Serial(doc=__doc__, image=img, env=get_env()) as root:
        # Verify both external services are reachable before deploying.
        with co.Parallel(name="access check") as access:
            access["Heroku"] = co.Exec(TEST_HEROKU)
            access["RedisLabs"] = co.Exec(TEST_REDIS)
        root["deploy"] = deploy()
        root["integration test"] = co.Exec(INTEGRATION_TEST)
        # Cleanup steps are independent of each other.
        with co.Parallel(name="teardown") as cleanup:
            cleanup["clear data"] = co.Exec(CLEAR_DATA)
            cleanup["stop"] = co.Exec(STOP_APP)
            cleanup["destroy"] = co.Exec(DESTROY_APP)
    return root
def ci_cd(projects=None) -> co.Serial:
    "Build all projects, run tests if builds succeed, then deploy if tests pass"
    # Evaluate the default lazily: the original `projects=utils.get_projects()`
    # ran at import time, freezing the project list when the module loaded.
    if projects is None:
        projects = utils.get_projects()
    # Use the standard python 3.8 image as a base and add all files from
    # the current dir. We also need to install conducto in the image in
    # order to dynamically generate the tree with Lazy in test().
    img = co.Image(image="python:3.8", copy_dir=".", reqs_py=["conducto"])
    output = co.Serial(image=img)
    output["Build"] = build(projects)
    output["Test"] = test(projects)
    output["Deploy"] = co.Exec("echo aws cloudformation deploy")
    return output
def main() -> co.Serial:
    """Verify creds, provision in parallel, deploy, test, then clean up."""
    img = co.Image(dockerfile="./Dockerfile", reqs_docker=True)
    with co.Serial(image=img, env=get_env(), doc=__doc__) as root:
        root["Check AWS Creds"] = co.Exec(CHECK_AWS_CREDS)
        # Infra, image build, lint, and unit tests are all independent.
        with co.Parallel(name="Init", doc=INIT_DOC) as setup:
            setup["Deploy Infra"] = deploy_infra()
            setup["Deploy Image"] = deploy_image()
            setup["Lint"] = co.Exec("black --check .")
            setup["Unit Test"] = co.Exec("python service/test.py --verbose")
        root["Deploy Service"] = deploy_service()
        root["Integration Test"] = co.Exec(INTEGRATION_CMD, doc=INTEGRATION_DOC)
        root["Cleanup"] = cleanup()
    return root
def pipeline() -> co.Serial:
    """Return a lazily-expanded pipeline whose deps are baked into the image."""
    # Conducto installs the dependencies into its image.
    img = co.Image(
        copy_url="https://github.com/MatrixManAtYrService/sandboxen",
        copy_branch="master",
        path_map={".": "./fortune_witherror"},
        reqs_py=["conducto", "sh"],
        reqs_packages=["fortune"],
    )
    # Defer node definition until the first node runs.
    root = co.Lazy(nodes_for_this_month)
    root.image = img
    return root
def run(branch: str) -> co.Serial:
    """
    Build a CI tree and mirror each Exec node's state to GitHub commit statuses.

    Looks up the head sha of `branch` in the statuses repo, then attaches
    queued/done/error callbacks to every Exec node.

    Fixes from review: the git lookup used `shell=True` with `branch`
    interpolated into the command string (shell-injectable); it now invokes
    git with an argument list and extracts the sha in Python. The debug
    print of the secret access token was removed — it leaked the secret to
    logs.
    """
    image = co.Image(image="python:3.6", reqs_py=["conducto"])
    root = co.Serial(image=image)
    with co.Serial(same_container=co.SameContainer.NEW, cpu=12, mem=32) as build:
        build["fetch"] = co.Exec("echo im fetching")
        build["checkout"] = co.Exec("echo im checking out")
        with co.Parallel(name="checks") as checks:
            checks["yapf"] = co.Exec("echo checking yapf")
            checks["python_tests"] = co.Exec("echo checking python tests")
            checks["flake8"] = co.Exec("echo checking flake8")
            checks["pylint"] = co.Exec("echo im checking pylint")
            checks["mypy"] = co.Exec("echo im checking mypy")
            checks["cppcheck"] = co.Exec("echo im checking cppcheck")
            checks["clang_format"] = co.Exec("echo im checking clang_format")
        build["build"] = co.Exec('echo im building now')
    root["build"] = build

    auth_token = co.api.Auth().get_token_from_shell()
    access_token = co.api.Secrets().get_user_secrets(
        auth_token)["GITHUB_ACCESS_TOKEN"]

    # Run git directly (no shell) and split off the sha ourselves instead of
    # piping through `cut`.
    stdout = subprocess.check_output(
        [
            "git",
            "ls-remote",
            "[email protected]:jmazar/conduco_statuses.git",
            f"refs/heads/{branch}",
        ]
    )
    out = stdout.decode("utf-8").strip()
    sha = out.split()[0] if out else ""
    print(sha)

    creator = co.callback.github_status_creator(
        owner="jmazar",
        repo="conduco_statuses",
        sha=sha,
        access_token=access_token,
    )
    for node in root.stream():
        if isinstance(node, co.Exec):
            node.on_queued(creator(state="pending"))
            node.on_done(creator(state="success"))
            node.on_error(creator(state="failure"))
    return root
def pipeline() -> co.Parallel:
    """ Customize an Image """
    # The previous example passed a plain string via the `image` node
    # parameter:
    #
    #     root["world"] = co.Exec(f"{cmd}", image="node:current-alpine")
    #
    # Passing a co.Image object instead unlocks more options.
    img = co.Image(
        name="my-shared-image",
        image="node:current-alpine",
        reqs_packages=["tree"],
        copy_url="https://github.com/conducto/conducto",
        copy_branch="main",
    )

    # Set `image` on a parent node; the children inherit the value.
    root = co.Parallel(image=img)

    # This node runs ok.
    root["Look around"] = co.Exec(""" tree -L 2 ; find . -name 'hello_.*' """)

    # These nodes have problems.
    root["Hi from Node"] = co.Exec("node hello_py_js/hello.js")
    root["Hi from C++"] = co.Exec(""" g++ hello_cpp/hello.cpp -o hello ; ./hello """)

    # Explore the pipeline to understand them; edit this file and relaunch
    # the pipeline to see results.
    root = tour.guide(root)
    return root
def data_pipeline() -> co.Serial:
    """
    ### **`co.data.pipeline`**
    `co.data.pipeline` is a pipeline-local key-value store. This data is only
    visible to your pipeline and persists until your pipeline is deleted. It
    is useful for writing data in one pipeline step, and reading it in
    another.

    `co.data.pipeline` has both a python and command line interface as
    `conducto-data-pipeline`. The first node of the example prints the
    command line usage to show the full interface.

    ### Example: Parameter Search
    One useful application is performing and summarizing a parameter search.
    In this example, we try different parameterizations of an algorithm in
    parallel. Each one stores its results using `co.data.pipeline.puts()`.
    Once all of the parallel tasks are done, it reads the results using
    `co.data.pipeline.gets()` and prints a summary.
    """
    # Dockerfile installs python, R, and conducto.
    img = co.Image(
        dockerfile="docker/Dockerfile.data",
        context=".",
        copy_dir="./code",
        reqs_py=["conducto"],
    )
    data_dir = "demo/data_science/data"

    root = co.Serial(image=img, doc=co.util.magic_doc())
    root["usage"] = co.Exec("conducto-data-pipeline --help")

    # Fan out over the full parameter grid: window x mean x volatility.
    root["parameter_search"] = search = co.Parallel()
    for window in [25, 50, 100]:
        search[f"window={window}"] = win_node = co.Parallel()
        for mean in [.05, .08, .11]:
            win_node[f"mean={mean}"] = mean_node = co.Parallel()
            for volatility in [.1, .125, .15, .2]:
                mean_node[f"volatility={volatility}"] = co.Exec(
                    f"python data.py --window={window} --mean={mean} "
                    f"--volatility={volatility} --data-dir={data_dir}")
    # Gather every stored result and print the summary table.
    root["summarize"] = co.Exec(f"Rscript data.R {data_dir}")
    return root
def cicd() -> co.Serial:
    """Install, test, build, and deploy the repo inside one shared container."""
    img = co.Image(
        "node:current-alpine",
        copy_url="https://github.com/flippedcoder/its-hot",
        copy_branch="master",
    )
    # SameContainer.NEW: all steps share one container, so node_modules
    # installed in the first step is visible to the later ones.
    pipeline = co.Serial(image=img, same_container=co.SameContainer.NEW)
    pipeline["Install dependencies..."] = co.Exec("npm i")
    pipeline["Running tests..."] = co.Exec("CI=true; npm test")
    pipeline["Build project..."] = co.Exec("npm build")
    pipeline["Deploy project..."] = co.Exec(
        "echo secret stuff to deploy to an S3 bucket on AWS"
    )
    return pipeline
def main() -> co.Serial:
    """Exercise conducto's error callbacks: retries, skips, and memory handling."""
    # A serial child that tolerates up to two of its commands failing.
    skipper = co.Serial(stop_on_error=False)
    skipper['pass'] = co.Exec('echo hi')
    skipper['fail1'] = co.Exec('echo hi | grep foo')
    skipper['fail2'] = co.Exec('echo hi | grep bar')
    skipper.on_error(co.callback.skip_some_errors(2))

    img = co.Image(copy_dir=".", reqs_py=["conducto"])
    with co.Serial(image=img, stop_on_error=False) as node:
        node["retry"] = co.Exec(fail_then_pass, "retry", 2)
        node["retry"].on_error(co.callback.retry(3))

        node["retry_2"] = co.Exec(fail_then_pass, "retry2", 3)
        node["retry_2"].on_error(co.callback.retry(2))

        node["retry_then_skip"] = co.Exec(fail_then_pass, "retry_then_skip", 3)
        node["retry_then_skip"].on_error(co.callback.retry_then_skip(2))

        node["retry_then_skip_2"] = co.Exec(fail_then_pass, "retry_then_skip", 2)
        node["retry_then_skip_2"].on_error(co.callback.retry_then_skip(3))

        node["retry_with_double_mem"] = co.Exec(
            fail_then_pass, "retry_with_double_mem", 2
        )
        node["retry_with_double_mem"].on_error(
            co.callback.retry_with_double_mem(3)
        )

        node["skip_some_errors"] = skipper

        node["handle_memory_errors"] = co.Exec(
            fail_then_pass, "retry_with_double_mem", 0
        )
        node["handle_memory_errors"].on_error(
            co.callback.handle_memory_errors()
        )

        node.on_done(co.callback.email(to="*****@*****.**"))
    return node
{df.transpose().round(2).to_markdown()} </ConductoMarkdown> """) ############################################################ # Constants and globals ############################################################ DATA_PATH = "/conducto/data/pipeline/steo.txt" DATASETS = { "Heating Degree Days": r"^STEO.ZWHD_[^_]*\.M$", "Cooling Degree Days": r"^STEO.ZWCD_[^_]*.M$", "Electricity Generation": r"^STEO.NGEPGEN_[^_]*\.M$", } IMG = co.Image("python:3.8", copy_dir=".", reqs_py=["conducto", "pandas", "matplotlib", "tabulate"]) # Data is downloaded from the United States Energy Information Administration. # https://www.eia.gov/opendata/bulkfiles.php DOWNLOAD_COMMAND = f""" echo "Downloading" curl http://api.eia.gov/bulk/STEO.zip > steo.zip unzip -cq steo.zip > {DATA_PATH} """.strip() if __name__ == "__main__": co.main(default=run)
import conducto as co from inspect import cleandoc import sys # Docker Images ############### # for playing the game tick-at-a-time game_of_life = co.Image(dockerfile='conway/Dockerfile', context='conway') # Command Templates ################### # for all commands, use strict mode so that errors draw attention header = "set -euo pipefail" # create the start state and stash it initialize_grid = cleandoc(''' {header} to_grid '0010000000 0010000011 0010100011 0100010000 0101110001 0100000001 0001000111 0010100000 0101010010 0010011000' > grid.json # store it as the only item in a list (subsequent grids coming soon)
import conducto as co

# Alpine Python image that clones the hello-world repo at master and maps
# its ./local-copy/p directory to ./p inside the container.
python_img = co.Image(
    image="python:3.8-alpine",
    copy_url="https://github.com/leachim6/hello-world",
    copy_branch="master",
    path_map={"./local-copy/p": "p"},
)


def hello() -> co.Serial:
    """One-node pipeline that runs the repo's python3 hello-world script."""
    root = co.Serial()
    root["Say Hi"] = co.Exec("python p/python3.py", image=python_img)
    return root


if __name__ == "__main__":
    co.main(default=hello)
import conducto as co import json from pathlib import Path # commands.py and experiment.py are in the same folder as this file from experiment import genomes, genes img = co.Image( image="ncbi/blast", # use the BLAST image published by ncbi on dockerhub copy_dir=".", # add this directory reqs_py=["conducto", "biopython", "pandas"], reqs_packages=["wget", "gzip"], ) data_dir = "/conducto/data/pipeline" def download_file(source_url, target_path) -> co.Serial: "Returns a serial node which downloads a gzipped FASTA file" target_dir = Path(target_path).parent node = co.Serial() node["Download"] = co.Exec( f"mkdir -p {target_dir} && wget -O {target_path}.gz {source_url}") node["Decompress"] = co.Exec(f"gunzip -c {target_path}.gz > {target_path}") return node def analyze(hits):
with co.Serial(stop_on_error=False, doc=__doc__) as root: # inner context: stop on errors # don't bother testing a failed deployment with co.Serial(name="run", stop_on_error=True) as run: run["deploy"] = deploy() run["test"] = test() # stop services root["clean up"] = teardown() return root # test tools test_img = co.Image(reqs_packages=["redis", "curl"], reqs_py=["conducto"]) def test() -> co.Serial: """ Check if both redis and flask are available. Then see if they're working. """ with co.Serial(image=test_img) as test: with co.Parallel(name="services up?") as check: check["redis up?"] = co.Exec(TEST_REDIS_CMD) check["flask up?"] = co.Exec(TEST_FLASK_CMD) test["integration test"] = co.Exec(INTEGRATION_TEST_CMD)
import conducto as co

# Image built from the local Dockerfile — presumably it provides the
# `pokemonsay` command used below (TODO confirm).
img = co.Image(dockerfile="Dockerfile")


def hello() -> co.Serial:
    """Single-step pipeline: Oddish says hi."""
    with co.Serial(image=img) as root:
        root["Say Hi"] = co.Exec("pokemonsay -pn Oddish 'Hi'")
    return root


if __name__ == "__main__":
    co.main(default=hello)
import conducto as co

IMG = co.Image()


def main() -> co.Serial:
    """Mock CI flow: parallel init steps, then deploy and integration test."""
    with co.Serial(image=IMG, requires_docker=True) as root:
        # Build, lint, and unit tests are independent; run them together.
        with co.Parallel(name="Init") as setup:
            setup["Build"] = co.Exec("sleep 3")
            setup["Lint"] = co.Exec("sleep 1")
            setup["Unit Test"] = co.Exec("sleep 1.5")
        root["Deploy"] = co.Exec("sleep 4")
        root["Integration Test"] = co.Exec("sleep 2")
    return root


if __name__ == "__main__":
    co.main(default=main)