def _redis_wrapper() -> co.Serial:
    """
    This is a simple wrapper that starts and stops a local redis instance
    around our *redis_data_store* example. This is just to mock a real redis
    service you might have running externally. The details of how this works
    are not critical right now. We use Conducto features `stop_on_error` and
    `requires_docker` that are discussed in a later tutorial.

    **Focus on the *redis_data_store* node for now.**
    """
    name = "conducto_demo_redis"
    mock_redis_start_cmd = f"""set -ex
docker run -p 6379:6379 -d --rm --name {name} redis:5.0-alpine
sleep 1  # wait for redis to start up
docker logs --details {name}  # error if redis container not running
docker inspect {name} --format="{{{{.State.Running}}}}"
"""
    mock_redis_stop_cmd = f"docker stop {name} || true"

    with co.Serial(
        image="docker:19.03",
        stop_on_error=False,
        requires_docker=True,
        doc=co.util.magic_doc(doc_only=True),
    ) as wrapper:
        co.Exec(mock_redis_start_cmd, name="mock_redis_start")
        wrapper["redis_data_store"] = redis_data_store()
        co.Exec(mock_redis_stop_cmd, name="mock_redis_stop")
    return wrapper
def disambiguate() -> co.Parallel:
    with co.Parallel(image=co.Image(copy_dir=".")) as node:
        # no ambiguity here, all kwargs refer to conducto.Node.__init__
        co.Exec('''echo "node has 1.5 cpu's"''', name="A", cpu=1.5)

        # native method parameters come first
        # modify the node object in a second step, then connect it to its parent
        node_obj = co.Exec(myfunc, "DDR4-2933 (quad channel)", cpu=2950)
        node_obj.set(cpu=0.75, mem=1.5)
        node["B"] = node_obj

        # or connect it to its parent, then modify it in place
        node["C"] = co.Exec(myfunc, "DDR4-2667 (dual channel)")
        node["C"].set(cpu=0.75, mem=1.5)

        # some non-custom types don't have obvious string representations
        payload = {"foo": 2, "bar": 3}
        # func(payload)
        # so you may have to handle the serialization yourself
        node["D"] = co.Exec(wrappedfunc, json.dumps(payload))

        # custom types work, but you need to provide helpers
        param_obj = Emoticon(happy=True)
        node["E"] = co.Exec(describe, param_obj)
    return node
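# The snippet above assumes helpers like `Emoticon` and `describe` exist elsewhere.
# Below is a minimal, hypothetical sketch of what such a custom parameter type
# could look like. The helper names (to_str / from_str) are an assumption about
# how the object gets serialized to and from a command-line string; they are not
# confirmed by the original source.
class Emoticon:
    def __init__(self, happy=True):
        self.happy = happy

    def to_str(self) -> str:
        return "happy" if self.happy else "sad"

    @staticmethod
    def from_str(s: str) -> "Emoticon":
        return Emoticon(happy=(s == "happy"))


def describe(emoticon):
    # Runs inside the Exec node once the parameter is deserialized.
    print(":-)" if emoticon.happy else ":-(")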
def redis_data_store() -> co.Serial:
    """
    There are many standard ways to store persistent data: databases, AWS S3,
    and in-memory caches like redis. An exec node can run any shell command,
    so it is easy to use any of these approaches. Here we populate environment
    variables pointing to our redis service, allowing us to write to and read
    from redis in a python script.
    """
    # export_cmd is just a hack to set REDIS_HOST to our mock instance
    export_cmd = (
        "export REDIS_HOST=$(ip route show default | awk '/default/{print $3}')"
    )
    redis_write_cmd = f"{export_cmd} && python code/redis_example.py --write"
    redis_read_cmd = f"{export_cmd} && python code/redis_example.py --read"

    env = {
        "REDIS_HOST": "override_me",
        "REDIS_PORT": "6379",
    }
    with co.Serial(image=utils.IMG, env=env, doc=co.util.magic_doc()) as redis_store:
        co.Exec(redis_write_cmd, name="redis_write")
        co.Exec(redis_read_cmd, name="redis_read")
    return redis_store
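# A minimal sketch of what a script like code/redis_example.py might contain; it
# is not part of the original source. It just shows how the REDIS_HOST and
# REDIS_PORT environment variables set on the node above could be consumed with
# the `redis` Python client.
import argparse
import os

import redis


def redis_example_main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--write", action="store_true")
    parser.add_argument("--read", action="store_true")
    args = parser.parse_args()

    client = redis.Redis(
        host=os.environ["REDIS_HOST"], port=int(os.environ["REDIS_PORT"])
    )
    if args.write:
        client.set("demo_key", "hello from the redis_write node")
    if args.read:
        print(client.get("demo_key"))


if __name__ == "__main__":
    redis_example_main()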
def life() -> co.Serial:
    with co.Serial(image=game_of_life) as pipeline:
        pipeline["initialize grid"] = co.Exec(initialize_grid)
        image_names = []

        # TODO: instead of modeling a fixed number of clock ticks,
        # use a lazy node to extend this until a grid state is repeated
        for tick in ticks:
            with co.Serial(name=f"tick {tick}", image=game_of_life) as iteration:
                iteration["show grid"] = co.Exec(show_grid(tick))
                iteration["find neighbors"] = co.Exec(find_neighborhoods(tick))
                with co.Parallel(name="apply_rules", image=game_of_life) as rules:
                    rules["isolate"] = co.Exec(isolate(tick))
                    rules["survive"] = co.Exec(survive(tick))
                    rules["crowd"] = co.Exec(crowd(tick))
                    rules["reproduce"] = co.Exec(reproduce(tick))
                    rules["ignore"] = co.Exec(ignore(tick))
                iteration["next grid"] = co.Exec(next_grid(tick))
            image_names.append(f"image_{tick}.png")

        image_list = " ".join(image_names)
        pipeline["animate"] = co.Exec(animate(image_list))
    return pipeline
def deploy_image() -> co.Serial:
    with co.Serial() as output:
        co.Exec(CREATE_REPO_CMD, name="Create Repo")
        co.Exec(BUILD_AND_PUSH_CMD, name="Build and Push", requires_docker=True)
    return output
def data_pipeline() -> co.Serial:
    """
    `conducto-data-pipeline` is a pipeline-local key-value store. This data is
    only visible to your pipeline and persists until your pipeline is archived.

    One useful application is storing binaries in a build node and retrieving
    them in a later test node. We exercise the `put` and `get` commands to do this.
    """
    build_cmd = """set -ex
go build -o bin/app ./app.go
conducto-data-pipeline put --name my_app_binary --file bin/app
"""
    test_cmd = """set -ex
conducto-data-pipeline get --name my_app_binary --file /tmp/app
/tmp/app --test
"""
    # Dockerfile installs golang and conducto.
    dockerfile = "./docker/Dockerfile.data"
    image = co.Image(dockerfile=dockerfile, context=".", copy_dir="./code")

    with co.Serial(image=image, doc=co.util.magic_doc()) as build_and_test:
        co.Exec("conducto-data-pipeline --help", name="usage")
        co.Exec(build_cmd, name="build")
        co.Exec(test_cmd, name="test")
    return build_and_test
def make_compute_features_node(
    in_dir, tmp_dir, out_file, start_date="00000000"
) -> co.Serial:
    """
    Builds a tree for computing features. Parallelize over different months.
    """
    all_files = glob.glob(f"{in_dir}/*.csv")
    all_yyyymms = sorted({os.path.basename(f)[:-4] for f in all_files})
    os.makedirs(tmp_dir, exist_ok=True)

    # Skip the first month because we need 1 month of history to compute features
    all_yyyymms = all_yyyymms[1:]

    # Then subset to only the ones beyond the start date
    all_yyyymms = [yyyymm for yyyymm in all_yyyymms if yyyymm >= start_date[:6]]

    # Make output
    output = co.Serial()
    output["Parallelize"] = co.Parallel()
    for yyyymm in all_yyyymms:
        output["Parallelize"][yyyymm] = co.Exec(compute_features, yyyymm, in_dir, tmp_dir)
    output["Merge"] = co.Exec(merge_data, tmp_dir, out_file)
    return output
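# Hypothetical sketches of the native functions referenced above; the real
# compute_features and merge_data are defined elsewhere and their feature logic
# is not shown here. These only illustrate the expected signatures and file I/O,
# assuming one CSV per yyyymm month.
import glob

import pandas as pd


def compute_features(yyyymm, in_dir, tmp_dir):
    # Read one month of raw data and write a per-month feature file.
    df = pd.read_csv(f"{in_dir}/{yyyymm}.csv")
    df.to_csv(f"{tmp_dir}/{yyyymm}_features.csv", index=False)


def merge_data(tmp_dir, out_file):
    # Concatenate the per-month feature files into a single output file.
    parts = sorted(glob.glob(f"{tmp_dir}/*_features.csv"))
    pd.concat([pd.read_csv(p) for p in parts]).to_csv(out_file, index=False)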
def deploy_infra() -> co.Serial:
    vpc_cmd = DEPLOY_STACK_CMD.format(stack="vpc")
    elb_cmd = DEPLOY_STACK_CMD.format(stack="elb")
    with co.Serial() as output:
        co.Exec(vpc_cmd, name="VPC")
        co.Exec(elb_cmd, name="ELB")
    return output
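# A hypothetical sketch of the command template used above; the real
# DEPLOY_STACK_CMD is defined elsewhere. It is shown only to make the
# .format(stack=...) usage concrete; the template path and stack naming
# are assumptions.
DEPLOY_STACK_CMD = (
    "aws cloudformation deploy"
    " --template-file cfn/{stack}.yaml"
    " --stack-name conducto-demo-{stack}"
    " --capabilities CAPABILITY_IAM"
)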
def poll_sensors() -> co.Serial:
    r = co.Serial()
    r['/pmt'] = co.Serial()
    r['/pmt/poll'] = co.Parallel(image=img)

    for name in range(1104):
        if name == 1002:
            # presumably this sensor is broken somehow
            r[f'/pmt/poll/{name}'] = co.Exec(certain, 1)
        else:
            # most of the sensors work just fine
            r[f'/pmt/poll/{name}'] = co.Exec(certain, 0)

    run_callback = co.callback.slack_status(
        recipient="SlackUser", message="polling sensors"
    )
    r.on_running(run_callback)

    err_callback = co.callback.slack_status(recipient="#array-status")
    r.on_error(err_callback)

    done_callback = co.callback.slack_status(
        recipient="#array-status",
        message="all sensors reporting nominally",
    )
    r.on_done(done_callback)

    # other events include:
    # - on_queued
    # - on_killed
    # - on_state_change
    return r
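# A hypothetical sketch of the native function polled above; the real `certain`
# is defined elsewhere. The idea is simply to exit nonzero when asked to fail,
# so the one "broken" sensor node errors and the on_error callback fires.
import sys


def certain(fail):
    if int(fail):
        print("sensor did not respond")
        sys.exit(1)
    print("sensor reading nominal")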
def run() -> co.Serial:
    image = co.Image(
        "python:3.7",
        copy_branch="master",
        copy_url="https://github.com/liamcryan/ieuler.git",
    )
    with co.Serial(image=image, doc=co.util.magic_doc()) as pipeline:
        co.Exec('pip install -r requirements.txt', name='build')
        co.Exec('pytest', name='tests')
    return pipeline
def teardown() -> co.Parallel:
    """
    Stop containers.
    """
    with co.Parallel(image=docker_img, requires_docker=True) as node:
        node["stop redis"] = co.Exec(STOP_REDIS_CMD)
        node["stop flask"] = co.Exec(STOP_FLASK_CMD)
    return node
def main() -> co.Parallel:
    with co.Parallel(image=IMG) as root:
        # Count lines of code in the remote Git repo.
        root["lines of code"] = co.Exec("cloc .")

        # Run a simple data analysis script located there.
        root["biggest US cities"] = co.Exec(
            "cd features/copy_url && python analyze.py cities.csv"
        )
    return root
def main() -> co.Serial:
    with co.Serial(image=IMG, requires_docker=True) as root:
        with co.Parallel(name="Init") as init:
            init["Build"] = co.Exec("sleep 3")
            init["Lint"] = co.Exec("sleep 1")
            init["Unit Test"] = co.Exec("sleep 1.5")
        root["Deploy"] = co.Exec("sleep 4")
        root["Integration Test"] = co.Exec("sleep 2")
    return root
def main() -> co.Serial:
    with co.Serial(image=IMG, requires_docker=True) as root:
        with co.Parallel(name="Init") as init:
            init["Build"] = co.Exec("docker build .")
            init["Lint"] = co.Exec("black --check .")
            init["Unit Test"] = co.Exec("python unit_test.py")
        root["Deploy"] = co.Exec("bash deploy_aws.sh")
        root["Integration Test"] = co.Exec("bash int_test.sh")
    return root
def cleanup() -> co.Serial:
    delete_service_cmd = DELETE_STACK_CMD.format(stack="service")
    delete_elb_cmd = DELETE_STACK_CMD.format(stack="elb")
    delete_vpc_cmd = DELETE_STACK_CMD.format(stack="vpc")
    with co.Serial(skip=True, doc=CLEANUP_DOC) as output:
        co.Exec(delete_service_cmd, name="Service")
        co.Exec(delete_elb_cmd, name="ELB")
        co.Exec(delete_vpc_cmd, name="VPC")
        co.Exec(DELETE_REPO_CMD, name="Repo")
    return output
def primes_less_than(n) -> co.Serial:
    n = int(n)
    img = co.Image(copy_dir=".")
    with co.Serial(same_container=co.SameContainer.NEW, image=img) as root:
        root["find primes"] = co.Exec(f"python sieve.py {n}")
        if n >= 3:
            root["check distribution"] = co.Exec(f"cat primes | python check.py {n}")
        root["is 2 included?"] = co.Exec("egrep '^2$' primes")
    return root
def hello() -> co.Serial:
    # Reuse the "build" container for the "test" node
    # so that the binary is available in the second node.
    with co.Serial(
        image=img,
        container_reuse_context=CRC.NEW,
        doc=co.util.magic_doc(comment=True),
    ) as root:
        co.Exec("g++ hello.cpp -o hello", name="build")
        co.Exec("./hello | grep 'World!'", name="test")
    return root
def download_file(source_url, target_path) -> co.Serial:
    "Returns a serial node which downloads a gzipped FASTA file"
    target_dir = Path(target_path).parent
    node = co.Serial()
    node["Download"] = co.Exec(
        f"mkdir -p {target_dir} && wget -O {target_path}.gz {source_url}"
    )
    node["Decompress"] = co.Exec(f"gunzip -c {target_path}.gz > {target_path}")
    return node
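# Example usage (hypothetical URL and path, not from the original): attach the
# download node to a pipeline root whose image provides wget and gunzip
# (alpine's busybox includes both).
def fasta_pipeline() -> co.Serial:
    root = co.Serial(image="alpine:3.12")
    root["Reference"] = download_file(
        "https://example.com/reference.fa.gz",
        "/tmp/data/reference.fa",
    )
    return root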
def pr(branch) -> co.Parallel:
    # Make a Docker image, based on python:alpine, with the whole repo and the
    # contents of the given branch.
    image = co.Image("python:alpine", copy_repo=True, copy_branch=branch)

    # Using that Docker image, run three commands in parallel to interact with
    # the repo's files.
    with co.Parallel(image=image) as root:
        co.Exec(f"echo {branch}", name="print branch")
        co.Exec("pwd", name="print working directory")
        co.Exec("ls -la", name="list files")
    return root
def main() -> co.Serial:
    with co.Serial(image=img) as p:  # p is for 'Pipeline root'
        p["get data"] = co.Exec(get_sensor_data)
        p["notify"] = co.Parallel()
        p["notify/stdout"] = co.Exec(plot_to_stdout)
        p["notify/channel"] = co.Exec(plot_to_slack)
        p["notify/team"] = co.Serial()
        for user in update_users:
            p[f"notify/team/{user}"] = co.Exec(message_to_slack_user, user)
    return p
def path() -> co.Serial:
    """
    The Node tree can be accessed with file system-like
    [paths](/docs/basics/pipeline-structure#path).
    """
    root = co.Serial(image="foo", doc=co.util.magic_doc())
    root["all together"] = co.Parallel()
    root["all together/a"] = co.Exec("echo step 1, image bar", image="bar")
    root["all together/b"] = co.Exec("echo step 1, image foo")
    root["one at a time"] = co.Serial(image="bar")
    root["one at a time/c"] = co.Exec("echo step 2, image bar")
    root["one at a time/d"] = co.Exec("echo step 3, image bar")
    return root
def dict() -> co.Serial:
    """
    Each Node is [dict-like](/docs/basics/pipeline-structure#dict), and you can
    build a hierarchy by assigning children into them.
    """
    root = co.Serial(image="foo", doc=co.util.magic_doc())
    root["all together"] = co.Parallel()
    root["all together"]["a"] = co.Exec("echo step 1, image bar", image="bar")
    root["all together"]["b"] = co.Exec("echo step 1, image foo")
    root["one at a time"] = co.Serial(image="bar")
    root["one at a time"]["c"] = co.Exec("echo step 2, image bar")
    root["one at a time"]["d"] = co.Exec("echo step 3, image bar")
    return root
def download_and_plot() -> co.Serial:
    download_command = """
        apt update -y && apt install -y curl unzip
        curl https://www.fs.usda.gov/rds/archive/products/RDS-2005-0004/RDS-2005-0004.zip > data.zip
        unzip data.zip
    """
    image = co.Image(dockerfile='./Dockerfile', context='.')
    with co.Serial(image=image) as pipeline:
        co.Exec(download_command, name="download")
        with co.Parallel(name='plot'):
            co.Exec('python rainfall.py', name='daily')
            co.Exec('python rainfall.py --resample M --save', name='monthly')
    return pipeline
def build_and_test() -> co.Serial:
    image = co.Image(copy_dir="./code")
    with co.Serial(image=image, stop_on_error=False) as pipeline:
        with co.Parallel(name="Trade") as first_trading:
            first_trading['US'] = co.Exec("python3 first_stock_trading.py")
            first_trading['CHINA'] = co.Exec("python3 second_stock_trading.py")
        with co.Parallel(name="TopK") as second_trading:
            second_trading['US'] = co.Exec("python3 first_topK_stock_pipeline.py")
            second_trading['CHINA'] = co.Exec("python3 second_topK_stock_pipeline.py")
    return pipeline
def main() -> co.Serial:
    img = co.Image(dockerfile="./Dockerfile", reqs_docker=True)
    with co.Serial(image=img, env=get_env(), doc=__doc__) as root:
        root["Check AWS Creds"] = co.Exec(CHECK_AWS_CREDS)
        with co.Parallel(name="Init", doc=INIT_DOC) as init:
            init["Deploy Infra"] = deploy_infra()
            init["Deploy Image"] = deploy_image()
            init["Lint"] = co.Exec("black --check .")
            init["Unit Test"] = co.Exec("python service/test.py --verbose")
        root["Deploy Service"] = deploy_service()
        root["Integration Test"] = co.Exec(INTEGRATION_CMD, doc=INTEGRATION_DOC)
        root["Cleanup"] = cleanup()
    return root
def test() -> co.Serial:
    """
    Check if both redis and flask are available. Then see if they're working.
    """
    with co.Serial(image=test_img) as test:
        with co.Parallel(name="services up?") as check:
            check["redis up?"] = co.Exec(TEST_REDIS_CMD)
            check["flask up?"] = co.Exec(TEST_FLASK_CMD)
        test["integration test"] = co.Exec(INTEGRATION_TEST_CMD)
    return test
def context() -> co.Serial:
    """
    You can use [context managers](/docs/basics/pipeline-structure#context)
    (Python's `with` statement) to add children. This lets you use whitespace
    to express node depth.
    """
    with co.Serial(image=foo, doc=co.util.magic_doc()) as root:
        with co.Parallel(name="all together"):
            co.Exec("echo step 1, image bar", name="a", image=bar)
            co.Exec("echo step 1, image foo", name="b")
        with co.Serial(name="one at a time", image=bar) as two:
            co.Exec("echo step 2, image bar", name="c")
            co.Exec("echo step 3, image bar", name="d")
    return root
def parallelize_reps(reps: int) -> co.Parallel:
    output = co.Parallel()
    data_size = reps
    min_rep = 0
    max_rep = reps
    for rep_i in range(min_rep, max_rep):
        print("inside rep " + str(rep_i))
        output[f'rep{rep_i}'] = co.Serial()
        # unpredictable
        output[f'rep{rep_i}']['p1'] = co.Exec(
            f"{experiment_command} GLOBAL-randomSeed {rep_i} "
            f"WORLD_CONVEYORBELT-randomize 1 && "
            f"conducto-perm-data put --name rep{rep_i}p1 --file LOD_data.csv"
        )
        # predictable
        output[f'rep{rep_i}']['p0'] = co.Exec(
            f"{experiment_command} GLOBAL-randomSeed {rep_i} "
            f"WORLD_CONVEYORBELT-randomize 0 && "
            f"conducto-perm-data put --name rep{rep_i}p0 --file LOD_data.csv"
        )
    return output
def run() -> co.Serial:
    "Download data from the US EIA, then visualize some datasets."
    with co.Serial(image=IMG, doc=co.util.magic_doc()) as output:
        # First download some data from the US Energy Information Administration.
        output["Download"] = co.Exec(DOWNLOAD_COMMAND)

        # Then make a few different visualizations of it.
        output["Display"] = co.Parallel()
        for dataset in DATASETS.keys():
            # Use the co.Exec shorthand for calling native Python functions.
            # It calls `display(dataset)` in an Exec node. It's equal to:
            #   python pipeline.py display --dataset={dataset}
            output["Display"][dataset] = co.Exec(display, dataset)
    return output
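# A hypothetical sketch of the `display` function called above; the real one is
# defined elsewhere in the original, along with DATASETS and DOWNLOAD_COMMAND.
# The downloaded-file layout used here is an assumption made only for illustration.
import pandas as pd


def display(dataset):
    # Load the series fetched by the "Download" node and print a quick summary.
    df = pd.read_csv(f"data/{dataset}.csv")  # location of downloaded data is assumed
    print(df.describe())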
def pipeline() -> co.Parallel:
    root = co.Parallel()
    root["one"] = co.Exec(
        cleandoc("""
            docker run --rm \\
              -e HEROKU_API_KEY='88d1c57c-c074-4333-9004-56f1b6b32e11' \\
              dickeyxxx/heroku-cli \\
              heroku apps
        """),
        requires_docker=True,
        image="docker:latest",
    )
    root["two"] = co.Exec("heroku apps", env=env, image="dickeyxxx/heroku-cli")
    return root