Example #1
    def to_prefect(self):
        """Compile the recipe to a Prefect.Flow object."""
        from prefect import Flow, task, unmapped

        has_cache_inputs = getattr(self, "cache_inputs", False)
        if has_cache_inputs:
            cache_input_task = task(self.cache_input, name="cache_input")
        prepare_target_task = task(self.prepare_target, name="prepare_target")
        store_chunk_task = task(self.store_chunk, name="store_chunk")
        finalize_target_task = task(self.finalize_target,
                                    name="finalize_target")

        with Flow("pangeo-forge-recipe") as flow:
            if has_cache_inputs:
                cache_task = cache_input_task.map(
                    input_key=list(self.iter_inputs()))
                upstream_tasks = [cache_task]
            else:
                upstream_tasks = []
            prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
            store_task = store_chunk_task.map(
                chunk_key=list(self.iter_chunks()),
                upstream_tasks=[unmapped(prepare_task)],
            )
            _ = finalize_target_task(upstream_tasks=[store_task])

        return flow
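
A minimal usage sketch (not from the source): assuming `recipe` is an instance of a recipe class exposing the `to_prefect()` method above, the compiled flow can be run directly.

recipe = ...  # placeholder: a recipe instance providing to_prefect()
flow = recipe.to_prefect()
state = flow.run()  # executes locally with Prefect's default executor
assert state.is_successful()
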
Example #2
def build_flow():
    name = os.path.splitext(os.path.basename(__file__))[0]  # drop the .py suffix
    kws = dict(log_stdout=True)
    with Flow(name) as flow:

        t = task(download_github_issues.get_watched_repositories)()

        t = task(download_github_issues.download_github_issues_for_repo).map(t)
        # Task.__or__ chains dependencies: `a | b` sets b downstream of a and returns b
        t |= task(etl_github_users.main, name='etl_github_users', **kws)()
        t |= task(etl_github_issues.main, name='etl_github_issues', **kws)()

    # TODO: flow.schedule =
    return flow
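
A hedged sketch of how this builder might be invoked; the project name passed to `register` is a placeholder.

flow = build_flow()
flow.run()                                  # execute locally
# flow.register(project_name="github-etl")  # or register with a Prefect backend (placeholder project name)
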
Example #3
def build_flow():
    name = os.path.splitext(os.path.basename(__file__))[0]  # drop the .py suffix
    kws = dict(log_stdout=True)
    with Flow(name) as flow:

        t = task(download_github_issue_events.main,
                 name='download_github_issue_events',
                 **kws)()  # noqa
        t |= task(etl_github_users.main, name='etl_github_users',
                  **kws)()  # noqa
        t |= task(etl_github_issue_events.main,
                  name='etl_github_issue_events',
                  **kws)()  # noqa

    # TODO: flow.schedule =
    return flow
Example #4
def run_flow(config, callargs, main=True):

    name = config.get('name', 'Default')
    if main:
        _set_seed(config.get('seed', 0))
        # save python code here:
        # os.path.dirname(os.path.dirname(mlflow.get_artifact_uri()))
        name = _log_xp(config, name)

    with Flow(name) as flow:
        ref = load_flow_inputs(config, main)
        for op in config.get('flow', []):
            if op['f'].endswith('.yaml'):
                with open(op['f'], 'r') as config_file:
                    subconfig = yaml.load(config_file, Loader=yaml.FullLoader)
                args = dict(
                    zip(subconfig.get('args', []), get_task_args(op, ref, [])))
                kwargs = get_task_kwargs(op, ref, [])
                kwargs.update(args)
                run = task(run_flow)(subconfig, kwargs, main=False)
                ref.update(get_task_output(op, run))
            else:
                ref = run_task(op, ref)
    if main:
        flow.save(f"/tmp/flow.flow")
        log_artifact(f"/tmp/flow.flow")
        os.remove(f"/tmp/flow.flow")
    state = flow.run(**callargs)
    return return_flow_output(config, ref, state)
Example #5
    def to_prefect(self):
        """Compile the recipe to a Prefect.Flow object."""
        from prefect import Flow, task, unmapped

        # TODO: allow recipes to customize which stages to run
        cache_input_task = task(self.cache_input, name="cache_input")
        prepare_target_task = task(self.prepare_target, name="prepare_target")
        store_chunk_task = task(self.store_chunk, name="store_chunk")
        finalize_target_task = task(self.finalize_target, name="finalize_target")

        with Flow("pangeo-forge-recipe") as flow:
            cache_task = cache_input_task.map(input_key=list(self.iter_inputs()))
            upstream_tasks = [cache_task]
            prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
            store_task = store_chunk_task.map(
                chunk_key=list(self.iter_chunks()), upstream_tasks=[unmapped(prepare_task)],
            )
            _ = finalize_target_task(upstream_tasks=[store_task])

        return flow
Example #6
    def test_airflow_task_uses_its_own_trigger_rules_by_default(
            self, airflow_settings):
        task = AirflowTask(
            db_conn=airflow_settings["db_conn"],
            task_id="run_this_last",
            dag_id="example_bash_operator",
            env=airflow_settings,
        )

        with Flow(name="test single task") as flow:
            res = task(execution_date="2011-01-01")
        flow_state = flow.run()

        assert flow_state.is_successful()
        assert flow_state.result[res].is_skipped()
Example #7
def test(e: Optional[Executor]):
    with TemporaryDirectory() as tmpdir:
        flow_result = LocalResult(tmpdir, serializer=JSONSerializer(),
                                  location="{task_name}.json")

        with Flow("write_result", result=flow_result) as f:
            _terminal = task(lambda: 42, checkpoint=True, name="magic")()

        with set_temporary_config({"flows.checkpointing": True}), \
             raise_on_exception():
            f.run(executor=e)

        files = os.listdir(tmpdir)
        assert files == ["magic.json"], files
        with open(os.path.join(tmpdir, files[0]), "rb") as file:
            val = json.load(file)
        assert val == 42
Example #8
    def test_airflow_task_uses_cli_flags(self, airflow_settings):
        task = AirflowTask(
            db_conn=airflow_settings["db_conn"],
            task_id="run_this_last",
            dag_id="example_bash_operator",
            cli_flags=["-A"],
            env=airflow_settings,
        )

        with Flow(name="test single task") as flow:
            res = task(execution_date="2011-01-02")
        flow_state = flow.run()

        assert flow_state.is_successful()
        assert flow_state.result[res].is_successful()
        assert not flow_state.result[res].is_skipped()
        assert flow_state.result[res].result is None
Example #9
    def __call__(self,
                 *args: Any,
                 flow: Flow = None,
                 **kwargs: Any) -> ResourceContext:
        if flow is None:
            flow = prefect.context.get("flow")
            if flow is None:
                raise ValueError("Could not infer an active Flow context.")

        init_task = prefect.task(self.resource_class,
                                 **self.init_task_kwargs)(*args,
                                                          flow=flow,
                                                          **kwargs)

        setup_task = ResourceSetupTask(**self.setup_task_kwargs)(init_task,
                                                                 flow=flow)

        cleanup_task = ResourceCleanupTask(**self.cleanup_task_kwargs)(
            init_task, setup_task, flow=flow)

        return ResourceContext(init_task, setup_task, cleanup_task, flow)
Example #10
    def pipelines_to_plan(self, pipelines: ParallelPipelines) -> Flow:
        with Flow("rechunker") as flow:
            for pipeline in pipelines:
                upstream_tasks = []  # type: List[task]
                for stage in pipeline.stages:
                    stage_task = task(stage.function, name=stage.name)
                    if stage.mappable is not None:
                        stage_task_called = stage_task.map(
                            list(stage.mappable),  # prefect doesn't accept a generator
                            config=unmapped(pipeline.config),
                            upstream_tasks=[unmapped(t) for t in upstream_tasks],
                        )
                    else:
                        stage_task_called = stage_task(
                            config=pipeline.config,
                            upstream_tasks=upstream_tasks,
                        )
                    upstream_tasks = [stage_task_called]
        return flow
Example #11
    bias_correct_obs_by_method,
)
from cmip6_downscaling.workflows.paths import (
    build_gcm_identifier,
    build_obs_identifier,
    make_annual_pyramid_path,
    make_bias_corrected_gcm_path,
    make_bias_corrected_obs_path,
    make_daily_pyramid_path,
    make_interpolated_gcm_path,
    make_interpolated_obs_path,
    make_monthly_pyramid_path,
)
from cmip6_downscaling.workflows.utils import BBox, rechunk_zarr_array_with_caching, regrid_ds

get_obs_task = task(get_obs)
get_gcm_task = task(get_gcm)


@task
def build_bbox(latmin: str, latmax: str, lonmin: str,
               lonmax: str) -> BBox:
    """Build a bounding box out of lat/lon inputs using the BBox data class defined in /utils.py

    Parameters
    ----------
    latmin : float
         Minimum latitude
    latmax : float
         Maximum latitude
Example #12
runtime = runtimes.get_runtime()

intermediate_cache_store = CacheStore(
    config.get("storage.intermediate.uri"),
    storage_options=config.get("storage.intermediate.storage_options"),
)
results_cache_store = CacheStore(
    config.get("storage.results.uri"),
    storage_options=config.get("storage.results.storage_options"),
)

# Transform Functions into Tasks -----------------------------------------------------------

return_obs_task = task(
    return_obs,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_return_obs_path,
)
get_coarse_obs_task = task(
    get_coarse_obs,
    tags=['dask-resource:TASKSLOTS=1'],
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_coarse_obs_path,
)
get_spatial_anomalies_task = task(
    get_spatial_anomalies,
    tags=['dask-resource:TASKSLOTS=1'],
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_spatial_anomalies_path,
)
return_coarse_obs_full_time_task = task(
Example #13
def _define_model_selection_flow():
    """Define flow that runs model selection.

    Specifically: data filtering, partitioning, model selection,
    and optional persistence of the results for a given dataset.

    Returns
    -------
    prefect.Flow
    """

    from prefect import task, Flow, Parameter, unmapped

    with Flow("model selection") as flow:
        df = Parameter("data")
        grid_search = Parameter("grid_search")
        target_col_name = Parameter("target_col_name")
        country_code_column = Parameter("country_code_column")
        include_rules = Parameter("include_rules")
        exclude_rules = Parameter("exclude_rules")
        parallel_over_columns = Parameter("parallel_over_columns")
        partition_columns = Parameter("partition_columns")
        frequency = Parameter("frequency")
        output_path = Parameter("output_path")
        persist_cv_data = Parameter("persist_cv_data")
        persist_cv_results = Parameter("persist_cv_results")
        persist_model_reprs = Parameter("persist_model_reprs")
        persist_best_model = Parameter("persist_best_model")
        persist_partition = Parameter("persist_partition")
        persist_model_selector_results = Parameter(
            "persist_model_selector_results")
        df_filtered = task(filter_data)(df=df,
                                        include_rules=include_rules,
                                        exclude_rules=exclude_rules)

        partitions = task(partition_data)(df=df_filtered,
                                          partition_by=parallel_over_columns)

        parallel_over_dicts = partitions["labels"]
        partition_dfs = partitions["data"]

        train_data = task(prepare_data_for_training).map(
            df=partition_dfs,
            frequency=unmapped(frequency),
            partition_columns=unmapped(partition_columns),
            parallel_over_columns=unmapped(parallel_over_columns),
            country_code_column=unmapped(country_code_column),
        )
        results = task(select_model).map(
            df=train_data,
            target_col_name=unmapped(target_col_name),
            grid_search=unmapped(grid_search),
            partition_columns=unmapped(partition_columns),
            parallel_over_dict=parallel_over_dicts,
            frequency=unmapped(frequency),
            country_code_column=unmapped(country_code_column),
        )

        write_ok = task(persist_experts_in_physical_partition).map(
            results=results,
            folder_path=unmapped(output_path),
            persist_cv_results=unmapped(persist_cv_results),
            persist_cv_data=unmapped(persist_cv_data),
            persist_model_reprs=unmapped(persist_model_reprs),
            persist_best_model=unmapped(persist_best_model),
            persist_partition=unmapped(persist_partition),
            persist_model_selector_results=unmapped(
                persist_model_selector_results),
        )

    flow.set_reference_tasks([write_ok])

    return flow
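
A hedged sketch of running the flow above. Every `Parameter` declared in the flow must be supplied through `flow.run(parameters=...)`; all values below are placeholders.

df = ...           # placeholder: the input pandas DataFrame
grid_search = ...  # placeholder: a pre-built grid-search object

flow = _define_model_selection_flow()
state = flow.run(parameters={
    "data": df,
    "grid_search": grid_search,
    "target_col_name": "target",
    "country_code_column": None,
    "include_rules": None,
    "exclude_rules": None,
    "parallel_over_columns": ["region"],
    "partition_columns": ["region"],
    "frequency": "D",
    "output_path": "/tmp/model_selection",
    "persist_cv_data": False,
    "persist_cv_results": False,
    "persist_model_reprs": False,
    "persist_best_model": True,
    "persist_partition": True,
    "persist_model_selector_results": True,
})
# the flow's success is judged by the mapped persist step set via set_reference_tasks above
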
Example #14
def _run(op, args, kwargs, maps):
    if len(maps):
        return task(getattr(tb, op['f'])).map(*args, **kwargs)
    else:
        return task(getattr(tb, op['f']))(*args, **kwargs)
Example #15
# prepare a placeholder namespace object for prefect tasks created from nuggets
ptask = lambda: None  # pylint: disable=C0103

# gather list of nuggets
NUGGET_LIST = [
    f for f in dir(nuggets)
    # exclude built-ins and include only functions
    if not f.startswith("__") and isinstance(getattr(nuggets, f), Callable)
]

# for each nugget, add it (and a parent) attr to ptask
for nugget in NUGGET_LIST:

    # parent attr for task
    parent = getattr(nuggets, nugget).__module__[20:]

    # if no parent attr attached to ptask, set it
    if not hasattr(ptask, parent):
        setattr(
            ptask,
            parent,
            lambda: None,
        )

    # set attr per parent attr within ptask
    setattr(
        getattr(ptask, parent),
        getattr(nuggets, nugget).__name__,
        prefect.task(getattr(nuggets, nugget)),
    )
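
A hedged illustration of how the resulting namespace might be used inside a flow; `some_module` and `some_nugget` are hypothetical names standing in for a real nugget's parent module and function.

from prefect import Flow

with Flow("nuggets-demo") as flow:
    # every attribute set above is a prefect Task wrapping the original nugget function
    result = ptask.some_module.some_nugget()  # hypothetical nugget
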
Example #16
@task
def read_files(files):
    dfs = [
        (
            pd.read_csv(file, sep=";")
            .rename(columns=str.lower)
            .assign(jaar=lambda x: int(str(file)[-20:-15]))
            # TODO: jaar as first column
        )
        for file in files
    ]
    return pd.concat(dfs)


to_gbq = task(pandas_gbq.to_gbq)

# gemeente.to_gbq('vektis.open_data_gemeente', project_id=CONFIG.gcp.project, if_exists='replace', location=CONFIG.gcp.location)

with Flow("Vektis open data") as flow:
    gcp_project = Parameter("gcp_project", required=True)
    gcp_location = Parameter("gcp_location", required=True)
    dataframes = read_files.map([files_gemeente, files_pc3])
    to_gbq = to_gbq.map(
        dataframe=dataframes,
        destination_table=["vektis.open_data_gemeente", "vektis.open_data_pc3"],
        project_id=unmapped(gcp_project),
        if_exists=unmapped("replace"),
        location=unmapped(gcp_location),
    )
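
A minimal sketch of executing the flow above; the project and location values are placeholders.

if __name__ == "__main__":
    state = flow.run(parameters={
        "gcp_project": "my-gcp-project",  # placeholder
        "gcp_location": "EU",             # placeholder
    })
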
Example #17
def test_result_pipe():
    t = prefect.task(lambda x, foo: x + 1)

    with prefect.Flow("test"):
        # A task created using .pipe should be identical to one created by using __call__
        assert vars(t(1, foo="bar")) == vars(t.pipe(t, foo="bar"))
Example #18
)
from cmip6_downscaling.workflows.utils import rechunk_zarr_array_with_caching

runtime = get_runtime()

intermediate_cache_store = CacheStore(
    config.get('storage.intermediate.uri'),
    storage_options=config.get('storage.intermediate.storage_options'),
)
results_cache_store = CacheStore(
    config.get('storage.results.uri'),
    storage_options=config.get('storage.results.storage_options'))

fit_and_predict_task = task(
    gard_fit_and_predict,
    checkpoint=True,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_gard_predict_output_path,
)

read_scrf_task = task(read_scrf)

gard_postprocess_task = task(
    gard_postprocess,
    result=XpersistResult(results_cache_store, serializer="xarray.zarr"),
    target=make_gard_post_processed_output_path,
)


@task(nout=3)
def prep_gard_input_task(
    obs: str,
Example #19
        historical_period=historical_period,
        day_rolling_window=day_rolling_window,
        year_rolling_window=year_rolling_window,
    )

    hist_trend = trend.sel(time=historical_period)
    pred_trend = trend.sel(time=predict_period)

    trend = xr.combine_by_coords([hist_trend, pred_trend],
                                 combine_attrs='drop_conflicts')
    return trend


remove_epoch_trend_task = task(
    remove_epoch_trend,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_epoch_adjusted_gcm_path,
)


@task(
    checkpoint=True,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_bias_corrected_gcm_path,
)
def maca_coarse_bias_correction_task(
    ds_gcm: xr.Dataset,
    ds_obs: xr.Dataset,
    train_period_start: str,
    train_period_end: str,
    variables: Union[str, List[str]],
Example #20
    fp_out = os.path.join(dir_out, f"{year}-{abbr}.csv")
    df_sched.to_csv(fp_out, index=False)
    return (year, abbr, df_sched)


# Triggers: https://docs.prefect.io/core/concepts/execution.html#triggers
# run the concat task if any of the mapped results above are successful
def concat_all_csv(dir_out, filename="yr2020"):
    df = pd.concat(map(pd.read_csv, glob.glob(f"{dir_out}/*.csv")))
    fp_out = os.path.join(dir_out, f"../{filename}.csv")
    df.to_csv(fp_out, index=False)


# Create some custom tasks
task_concat = task(concat_all_csv, trigger=any_successful)

with Flow("Raw Pull") as flow_pull:

    dir_out = Parameter("dir_out")
    from_year = Parameter("from_year")
    thru_year = Parameter("thru_year")

    abbrs = get_list_of_team_abbrs()
    year_abbrs = create_year_abbr_combo(abbrs,
                                        from_year=from_year,
                                        thru_year=thru_year)
    dfs = return_schedule_dataframe.map(year_abbrs, dir_out=unmapped(dir_out))
    concat = task_concat(dir_out, upstream_tasks=[dfs])

if __name__ == "__main__":
Example #21
    return response.text


@task
def load_file(filename: str) -> str:
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


@task
def printa(stuff):
    print(stuff)


# note: this ShellTask instance rebinds the name `task`, shadowing the @task decorator used above
task = ShellTask(return_all=True)
with Flow("shell") as f:
    translation_server_url = "http://localhost:1969"
    bibtex = load_file("./workspace/aksw-short.bib")
    zotero = import_translation(bibtex, translation_server_url)
    rdf = export_translation(zotero, translation_server_url,
                             "rdf_bibliontology")
    turtle = task(command="rapper - -o turtle -I www.test.com > tests.ttl")
    printa(turtle)

f.run_config = DockerRun(image="prefecthq/prefect")
f.register(project_name="tutoriala")

# Configure extra environment variables for this flow,
# and set a custom image
# f.run()
Example #22
"""
# Mapping

[Mapping](/core/concepts/mapping.md) in Prefect can be used to apply the same
task (or tasks) to multiple inputs. To map a task, use its `.map` method
instead of calling the task itself.

```python
from prefect import task

@task
def add(x, y):
    return x + y

add(1, 2)  # 3
add.map([1, 10], [2, 3])  # [3, 13]
```

By default, all arguments to `.map` are expected to be iterables that will be
mapped over. If you want to pass a non-iterable argument to be shared by all
branches of a mapped task, you can wrap that argument in `unmapped`
([docs](/core/concepts/mapping.md#unmapped-inputs)).

```python
from prefect import unmapped

add.map([1, 10], unmapped(2))  # [3, 12]
```

In this example we build a flow with 4 stages: