Example 1
def backtest_node(model_path, in_glob, out_dir) -> co.Parallel:
    import glob
    import os

    in_files = sorted(glob.glob(in_glob))

    output = co.Parallel()
    for f in in_files:
        # Input: RAW_DATA_DIR/validate3554.tfrecord
        # Output: OUT_DIR/validate3554.pkl.gzip
        base = os.path.basename(f).replace(".tfrecord", "")
        out_path = os.path.join(out_dir, base + ".pkl.gzip")

        if len(in_files) > 50:
            import re
            parent = re.sub(r"(\d\d)\d\d", r"\1__", base)
            if parent not in output:
                output[parent] = co.Parallel()
            base = f"{parent}/{base}"

        output[base] = co.Exec(commands.backtest,
                               model_path=model_path,
                               data_path=f,
                               out_path=out_path)
    return output
Example 2
def test(projects: typing.List[str]) -> co.Parallel:
    "Group tests by project, all in parallel."
    output = co.Parallel()
    for project in projects:
        output[project] = co.Parallel()
        for name in utils.get_tests(project):
            # co.Exec usually takes a shell command string; here it takes (func, *args, **kwargs).
            output[project][name] = co.Exec(utils.run_test, project, test=name)
    return output
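A hedged sketch contrasting the two co.Exec calling conventions that comment refers to; `greet` is a hypothetical native function, not part of the original example:

def greet(name: str, punctuation: str = "!"):
    print(f"hello {name}{punctuation}")

def both_forms() -> co.Parallel:
    output = co.Parallel()
    # form 1: a shell command string
    output["shell"] = co.Exec("echo hello world")
    # form 2: a native Python function plus its (*args, **kwargs)
    output["native"] = co.Exec(greet, "world", punctuation="?")
    return output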
Example 3
def build_and_test() -> co.Serial:
    image = co.Image(copy_dir="./code")
    with co.Serial(image=image, stop_on_error=False) as pipeline:
        with co.Parallel(name="Trade") as first_trading:
            first_trading['US'] = co.Exec("python3 first_stock_trading.py")
            first_trading['CHINA'] = co.Exec("python3 second_stock_trading.py")
        with co.Parallel(name="TopK") as second_trading:
            second_trading['US'] = co.Exec(
                "python3 first_topK_stock_pipeline.py")
            second_trading['CHINA'] = co.Exec(
                "python3 second_topK_stock_pipeline.py")

    return pipeline
Example 4
def main() -> co.Serial:
    image = co.Image(dockerfile="./Dockerfile", copy_dir=".")
    with co.Serial(doc=__doc__, image=image, env=get_env()) as root:
        with co.Parallel(name="access check") as access_check:
            access_check["Heroku"] = co.Exec(TEST_HEROKU)
            access_check["RedisLabs"] = co.Exec(TEST_REDIS)
        root["deploy"] = deploy()
        root["integration test"] = co.Exec(INTEGRATION_TEST)
        with co.Parallel(name="teardown") as teardown:
            teardown["clear data"] = co.Exec(CLEAR_DATA)
            teardown["stop"] = co.Exec(STOP_APP)
            teardown["destroy"] = co.Exec(DESTROY_APP)
    return root
Example 5
def make_compute_features_node(in_dir,
                               tmp_dir,
                               out_file,
                               start_date="00000000") -> co.Serial:
    """
    Builds a tree for computing features. Parallelize over different months.
    """
    all_files = glob.glob(f"{in_dir}/*.csv")
    all_yyyymms = sorted({os.path.basename(f)[:-4] for f in all_files})

    os.makedirs(tmp_dir, exist_ok=True)

    # Skip the first month because we need 1 month of history to compute features
    all_yyyymms = all_yyyymms[1:]

    # Then subset to only the ones beyond the start date
    all_yyyymms = [
        yyyymm for yyyymm in all_yyyymms if yyyymm >= start_date[:6]
    ]

    # Make output
    output = co.Serial()
    output["Parallelize"] = co.Parallel()
    for node, yyyymm in co.util.makeyyyymmnodes(output["Parallelize"],
                                                all_yyyymms):
        node[yyyymm] = co.Exec(compute_features, yyyymm, in_dir, tmp_dir)
    output["Merge"] = co.Exec(merge_data, tmp_dir, out_file)
    return output
Example 6
def life() -> co.Serial:

    with co.Serial(image=game_of_life) as pipeline:

        pipeline["initialize grid"] = co.Exec(initialize_grid)

        image_names = []
        # TODO: instead of modeling a fixed number of clock ticks
        # use a lazy node to extend this until a grid state is repeated
        for tick in ticks:
            with co.Serial(name=f"tick {tick}",
                           image=game_of_life) as iteration:

                iteration["show grid"] = co.Exec(show_grid(tick))
                iteration["find neighbors"] = co.Exec(find_neighborhoods(tick))

                with co.Parallel(name=f"apply_rules",
                                 image=game_of_life) as rules:

                    rules["isolate"] = co.Exec(isolate(tick))
                    rules["survive"] = co.Exec(survive(tick))
                    rules["crowd"] = co.Exec(crowd(tick))
                    rules["reproduce"] = co.Exec(reproduce(tick))
                    rules["ignore"] = co.Exec(ignore(tick))

                iteration["next grid"] = co.Exec(next_grid(tick))

            image_names.append(f"image_{tick}.png")

        image_list = " ".join(image_names)
        pipeline["animate"] = co.Exec(animate(image_list))

    return pipeline
Example 7
def poll_sensors() -> co.Serial:

    r = co.Serial()
    r['/pmt'] = co.Serial()
    r['/pmt/poll'] = co.Parallel(image=img)
    for name in range(1104):

        if name == 1002:
            # presumably this sensor is broken somehow
            r[f'/pmt/poll/{name}'] = co.Exec(certain, 1)
        else:

            # most of the sensors work just fine
            r[f'/pmt/poll/{name}'] = co.Exec(certain, 0)

    run_callback = co.callback.slack_status(recipient="SlackUser",
                                            message="polling sensors")
    r.on_running(run_callback)

    err_callback = co.callback.slack_status(recipient="#array-status")
    r.on_error(err_callback)

    done_callback = co.callback.slack_status(
        recipient="#array-status",
        message="all sensors reporting nominally",
    )
    r.on_done(done_callback)

    # available events include:
    # - on_queued
    # - on_running
    # - on_killed
    # - on_state_change

    return r
Example 8
def nodes_for_this_month() -> co.Parallel:
    """
    This function runs in the container for the generate step.
    It returns a node to be executed as part of the execute step.
    """

    # linux utility: fortune
    # python library: sh
    # The above are not dependencies for launching this pipeline, but they
    # must be installed in the image to be referenced by this function.

    os.environ['PATH'] = ':'.join([os.environ['PATH'], "/usr/games"])
    from sh import fortune

    now = datetime.now()
    parent = co.Parallel()
    for i in range(monthrange(now.year, now.month)[1]):

        date = f"{now.year}-{now.month}-{i + 1}"

        fortune_str = fortune()

        cmd = cleandoc(f"""
            echo "About {date} the spirits say:"
            cat << EOF
            {indent(fortune_str, prefix='            ')}
            EOF""")

        parent[date] = co.Exec(cmd)

    return parent
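A minimal sketch of how a generator like this is typically attached: co.Lazy runs the function in a "Generate" step and expands its returned subtree in an "Execute" step. The entrypoint and image names here are assumptions (Example 17 below shows the command-string form of co.Lazy):

def main() -> co.Serial:
    # fortune_img is a placeholder for an image with fortune and sh installed
    with co.Serial(image=fortune_img) as root:
        root["this month"] = co.Lazy(nodes_for_this_month)
    return root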
Example 9
def disambiguate() -> co.Parallel:
    with co.Parallel(image=co.Image(copy_dir=".")) as node:

        # no ambiguity here, all kwargs refer to conducto.Node.__init__
        co.Exec('''echo "node has 1.5 cpu's"''', name="A", cpu=1.5)

        # native method parameters come first
        # modify the node object in a second step, then connect it to its parent
        node_obj = co.Exec(myfunc, "DDR4-2933 (quad channel)", cpu=2950)
        node_obj.set(cpu=0.75, mem=1.5)
        node["B"] = node_obj

        # or connect it to its parent, then modify it in place
        node["C"] = co.Exec(myfunc, "DDR4-2667 (dual channel)")
        node["C"].set(cpu=0.75, mem=1.5)

        # some non-custom types don't have obvious string representations,
        # so co.Exec has no way to serialize a call like co.Exec(func, payload)
        payload = {"foo": 2, "bar": 3}

        # so you may have to handle the serialization yourself
        node["D"] = co.Exec(wrappedfunc, json.dumps(payload))

        # custom types work, but you need to provide helpers
        param_obj = Emoticon(happy=True)
        node["E"] = co.Exec(describe, param_obj)

    return node
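The helpers above are not shown in the original; as a sketch, `wrappedfunc` for node "D" might simply decode the JSON string that co.Exec serialized:

def wrappedfunc(payload_json: str):
    import json
    # recover the dict that was passed as a JSON string
    payload = json.loads(payload_json)
    print(payload["foo"] + payload["bar"])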
Example 10
def download_node(data_root=DATA_ROOT,
                  num_shards=500,
                  max_shard=500) -> co.Parallel:
    output = co.Parallel()
    for ds in ["train", "validate", "test"]:
        output[ds] = co.Parallel()
        for shard in range(1, num_shards + 1):
            if shard > max_shard:
                break
            output[ds][f"shard_{shard}_of_{num_shards}"] = co.Exec(f"""
                set -x -o pipefail
                script=`pwd`/download.py
                mkdir -p {data_root}
                cd {data_root}
                shard={shard},{num_shards} partition=2/frame/{ds} mirror=us python $script
                """)
    return output
Example 11
def main() -> co.Parallel:
    with co.Parallel(image=IMG) as root:
        # Count lines of code in the remote Git repo.
        root["lines of code"] = co.Exec("cloc .")
        # Run a simple data analysis script located there.
        root["biggest US cities"] = co.Exec(
            "cd features/copy_url && python analyze.py cities.csv")
    return root
Example 12
def teardown() -> co.Parallel:
    """
    Stop containers.
    """
    with co.Parallel(image=docker_img, requires_docker=True) as node:
        node["stop redis"] = co.Exec(STOP_REDIS_CMD)
        node["stop flask"] = co.Exec(STOP_FLASK_CMD)
    return node
Example 13
def data_pipeline() -> co.Serial:
    """
    ### **`co.data.pipeline`**
    `co.data.pipeline` is a pipeline-local key-value store. This data is only
    visible to your pipeline and persists until your pipeline is deleted. It
    is useful for writing data in one pipeline step, and reading it in another.

    `co.data.pipeline` has both a python and command line interface as
    `conducto-data-pipeline`. The first node of the example prints the command line
    usage to show the full interface.

    ### Example: Parameter Search
    One useful application is performing and summarizing a parameter search.
    In this example, we try different parameterizations of an algorithm in
    parallel. Each one stores its results using `co.data.pipeline.puts()`. Once
    all of the parallel tasks are done, it reads the results using
    `co.data.pipeline.gets()` and prints a summary.
    """
    # Dockerfile installs python, R, and conducto.
    image = co.Image(dockerfile="docker/Dockerfile.data",
                     context=".",
                     copy_dir="./code",
                     reqs_py=["conducto"])

    data_dir = "demo/data_science/data"

    output = co.Serial(image=image, doc=co.util.magic_doc())
    output["usage"] = co.Exec("conducto-data-pipeline --help")

    output["parameter_search"] = ps = co.Parallel()

    for window in [25, 50, 100]:
        ps[f"window={window}"] = w = co.Parallel()

        for mean in [.05, .08, .11]:
            w[f"mean={mean}"] = m = co.Parallel()

            for volatility in [.1, .125, .15, .2]:
                m[f"volatility={volatility}"] = co.Exec(
                    f"python data.py --window={window} --mean={mean} "
                    f"--volatility={volatility} --data-dir={data_dir}")

    output["summarize"] = co.Exec(f"Rscript data.R {data_dir}")

    return output
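The data.py and data.R workers are not shown. A hedged sketch of the Python side, assuming `puts`/`gets` take a key plus bytes as in the conducto-data-pipeline CLI:

import json
import conducto as co

def store_result(window, mean, volatility, result: dict):
    key = f"params/window={window}/mean={mean}/vol={volatility}"
    co.data.pipeline.puts(key, json.dumps(result).encode())

def load_result(window, mean, volatility) -> dict:
    key = f"params/window={window}/mean={mean}/vol={volatility}"
    return json.loads(co.data.pipeline.gets(key))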
Example 14
def main() -> co.Serial:
    with co.Serial(image=IMG, requires_docker=True) as root:
        with co.Parallel(name="Init") as init:
            init["Build"] = co.Exec("docker build .")
            init["Lint"] = co.Exec("black --check .")
            init["Unit Test"] = co.Exec("python unit_test.py")
        root["Deploy"] = co.Exec("bash deploy_aws.sh")
        root["Integration Test"] = co.Exec("bash int_test.sh")
    return root
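These main() examples omit the launch stub; by Conducto convention, a pipeline file ends with the following, which hands the assembled tree to the CLI/web app:

if __name__ == "__main__":
    co.main(default=main)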
Example 15
def main() -> co.Serial:
    with co.Serial(image=IMG, requires_docker=True) as root:
        with co.Parallel(name="Init") as init:
            init["Build"] = co.Exec("sleep 3")
            init["Lint"] = co.Exec("sleep 1")
            init["Unit Test"] = co.Exec("sleep 1.5")
        root["Deploy"] = co.Exec("sleep 4")
        root["Integration Test"] = co.Exec("sleep 2")
    return root
Example 16
def build(projects: typing.List[str]) -> co.Parallel:
    "Build projects in parallel, using simple shell command."

    # Override the parent image to use one with docker installed.
    img = co.Image(image="docker:19.03", copy_dir=".")

    output = co.Parallel(image=img, requires_docker=True)
    for project in projects:
        # Command needs docker; inherits flag from parent node
        output[project] = co.Exec(f"cd {project} && docker build .")
    return output
Example 17
def main() -> co.Parallel:
    """
    Dynamically build pipelines for each actor in a static list.
    """
    actors = ["Oprah Winfrey", "Kate Mara", "Don Cheadle", "Dwayne Johnson"]
    root = co.Parallel(image=_get_image())
    for actor in actors:
        root[actor] = co.Lazy(
            f"python pipeline.py all_by_actor '{actor}'"
        )
    return root
Example 18
def pr(branch) -> co.Parallel:
    # Make a Docker image, based on python:alpine, with the whole repo and the contents
    # of the given branch.
    image = co.Image("python:alpine", copy_repo=True, copy_branch=branch)

    # Using that Docker image, run three commands in parallel to interact with the
    # repo's files.
    with co.Parallel(image=image) as root:
        co.Exec(f"echo {branch}", name="print branch")
        co.Exec("pwd", name="print working directory")
        co.Exec("ls -la", name="list files")
    return root
Example 19
def main() -> co.Serial:
    with co.Serial(image=get_image(), doc=__doc__) as root:
        with co.Parallel(name="Initialize"):
            co.Exec("docker build -t my_image .",
                    name="Build",
                    requires_docker=True)
            co.Exec("black --check .", name="Lint")
            co.Exec("python test.py --verbose", name="Unit Test")
        root["Deploy"] = co.Exec(DEPLOY_CMD, requires_docker=True)
        root["Integration Test"] = co.Exec(INTEGRATION_TEST_CMD)
        root["Cleanup"] = co.Exec("docker kill my_app", requires_docker=True)
    return root
Example 20
def compute_covs_node(in_glob, out_dir) -> co.Parallel:
    import glob
    import os
    in_files = sorted(glob.glob(in_glob))

    output = co.Parallel()
    for f in in_files:
        # Input: RAW_DATA_DIR/train3554.tfrecord
        # Output: COVS_ROOT/train3554.pkl.gzip
        base = os.path.basename(f).replace(".tfrecord", "")
        out_path = os.path.join(out_dir, base + ".pkl.gzip")

        if len(in_files) > 50:
            import re
            parent = re.sub(r"(\d\d)\d\d", r"\1__", base)
            if parent not in output:
                output[parent] = co.Parallel()
            base = f"{parent}/{base}"

        output[base] = co.Exec(commands.compute_cov, f, out_path)
    return output
Example 21
def islands() -> co.Serial:
    with co.Serial() as pipeline:
        pipeline["hawaii"] = co.Exec("echo big island")
        with co.Parallel(name="maui_county") as maui_county:
            maui_county["maui"] = co.Exec("echo valley isle")
            maui_county["lanai"] = co.Exec("echo pineapple isle")
            maui_county["molokai"] = co.Exec("echo friendly isle")
            maui_county["kahoolawe"] = co.Exec("echo target isle")
        pipeline["oahu"] = co.Exec("echo gathering place")
        with co.Serial(name="kauai_county") as kauai_county:
            kauai_county["kauai"] = co.Exec("echo garden isle")
            kauai_county["niihau"] = co.Exec("echo forbidden isle")
    return pipeline
Example 22
def main() -> co.Serial:

    with co.Serial(image=img) as p:  # p is for 'Pipeline root'

        p["get data"] = co.Exec(get_sensor_data)
        p["notify"] = co.Parallel()
        p["notify/stdout"] = co.Exec(plot_to_stdout)
        p["notify/channel"] = co.Exec(plot_to_slack)
        p["notify/team"] = co.Serial()
        for user in update_users:
            p[f"notify/team/{user}"] = co.Exec(message_to_slack_user, user)

    return p
Example 23
def download_and_plot() -> co.Serial:
    download_command = """
            apt update -y && apt install -y curl unzip
            curl https://www.fs.usda.gov/rds/archive/products/RDS-2005-0004/RDS-2005-0004.zip > data.zip
            unzip data.zip
        """
    image = co.Image(dockerfile='./Dockerfile', context='.')
    with co.Serial(image=image) as pipeline:
        co.Exec(download_command, name="download")
        with co.Parallel(name='plot'):
            co.Exec('python rainfall.py', name='daily')
            co.Exec('python rainfall.py --resample M --save', name='monthly')
    return pipeline
Example 24
def path() -> co.Serial:
    """
    The Node tree can be accessed with file system-like
    [paths](/docs/basics/pipeline-structure#path).
    """
    root = co.Serial(image="foo", doc=co.util.magic_doc())
    root["all together"] = co.Parallel()
    root["all together/a"] = co.Exec("echo step 1, image bar", image="bar")
    root["all together/b"] = co.Exec("echo step 1, image foo")
    root["one at a time"] = co.Serial(image="bar")
    root["one at a time/c"] = co.Exec("echo step 2, image bar")
    root["one at a time/d"] = co.Exec("echo step 3, image bar")
    return root
Example 25
def nodes_for_this_month(now):
    parent = co.Parallel()
    for i in range(monthrange(now.year, now.month)[1]):
        date = f"{now.year}-{now.month}-{i}"
        fortune = get_fortune()

        cmd = cleandoc(f"""
            echo "About {date} the spirits say:"
            cat << EOF
            {indent(fortune, prefix='            ')}
            EOF""")
        parent[date] = co.Exec(cmd)
    return parent
Example 26
def dict() -> co.Serial:
    """
    Each Node is [dict-like](/docs/basics/pipeline-structure#dict), and you can
    build a hierarchy by assigning children into them.
    """
    root = co.Serial(image="foo", doc=co.util.magic_doc())
    root["all together"] = co.Parallel()
    root["all together"]["a"] = co.Exec("echo step 1, image bar", image="bar")
    root["all together"]["b"] = co.Exec("echo step 1, image foo")
    root["one at a time"] = co.Serial(image="bar")
    root["one at a time"]["c"] = co.Exec("echo step 2, image bar")
    root["one at a time"]["d"] = co.Exec("echo step 3, image bar")
    return root
Example 27
def run() -> co.Serial:
    "Download data from the US EIA, then visualize some datasets."
    with co.Serial(image=IMG, doc=co.util.magic_doc()) as output:
        # First download some data from the US Energy Information Administration.
        output["Download"] = co.Exec(DOWNLOAD_COMMAND)

        # Then make a few different visualizations of it.
        output["Display"] = co.Parallel()
        for dataset in DATASETS.keys():
            # Use co.Exec shorthand for calling native Python functions.
            # It calls `display(dataset)` in an Exec node. It's equivalent to:
            #   python pipeline.py display --dataset={dataset}
            output["Display"][dataset] = co.Exec(display, dataset)
    return output
Example 28
def test() -> co.Serial:
    """
    Check if both redis and flask are available.  Then see if they're
    working.
    """

    with co.Serial(image=test_img) as test:

        with co.Parallel(name="services up?") as check:
            check["redis up?"] = co.Exec(TEST_REDIS_CMD)
            check["flask up?"] = co.Exec(TEST_FLASK_CMD)

    test["integration test"] = co.Exec(INTEGRATION_TEST_CMD)
    return test
Example 29
def all_by_actor(actor) -> co.Parallel:
    """
    Return a pipeline listing all Netflix shows with an actor.
    Call with co.Lazy to generate pipeline at runtime.
    """
    df = _load_data()
    # na=False treats shows with no cast listed as non-matches
    titles = df[df.cast.str.contains(actor, na=False)].title

    output = co.Parallel()
    for title in titles:
        output[title] = co.Exec(
            f"python pipeline.py for_title {repr(title)}"
        )
    return output
Example 30
def parallelize_reps(reps: int) -> co.Parallel:
    output = co.Parallel()

    for rep_i in range(reps):
        print("inside rep " + str(rep_i))
        output[f'rep{rep_i}'] = co.Serial()
        # unpredictable
        output[f'rep{rep_i}']['p1'] = co.Exec(
            f"{experiment_command} GLOBAL-randomSeed {rep_i} "
            f"WORLD_CONVEYORBELT-randomize 1 && "
            f"conducto-perm-data put --name rep{rep_i}p1 --file LOD_data.csv")
        # predictable
        output[f'rep{rep_i}']['p0'] = co.Exec(
            f"{experiment_command} GLOBAL-randomSeed {rep_i} "
            f"WORLD_CONVEYORBELT-randomize 0 && "
            f"conducto-perm-data put --name rep{rep_i}p0 --file LOD_data.csv")
    return output