def to_prefect(self):
    """Compile the recipe to a Prefect.Flow object."""
    from prefect import Flow, task, unmapped

    has_cache_inputs = getattr(self, "cache_inputs", False)
    if has_cache_inputs:
        cache_input_task = task(self.cache_input, name="cache_input")
    prepare_target_task = task(self.prepare_target, name="prepare_target")
    store_chunk_task = task(self.store_chunk, name="store_chunk")
    finalize_target_task = task(self.finalize_target, name="finalize_target")

    with Flow("pangeo-forge-recipe") as flow:
        if has_cache_inputs:
            cache_task = cache_input_task.map(input_key=list(self.iter_inputs()))
            upstream_tasks = [cache_task]
        else:
            upstream_tasks = []
        prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
        store_task = store_chunk_task.map(
            chunk_key=list(self.iter_chunks()),
            upstream_tasks=[unmapped(prepare_task)],
        )
        _ = finalize_target_task(upstream_tasks=[store_task])

    return flow
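# A minimal usage sketch for the method above, assuming `recipe` is an
# instance of a recipe class exposing to_prefect() (the name is hypothetical).
# Prefect 1.x builds the DAG lazily; nothing runs until flow.run() is called.
flow = recipe.to_prefect()
state = flow.run()
assert state.is_successful()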
def build_flow():
    # strip('.py') would drop any leading/trailing '.', 'p', or 'y' characters;
    # splitext removes just the extension.
    name = os.path.splitext(os.path.basename(__file__))[0]
    kws = dict(log_stdout=True)
    with Flow(name) as flow:
        t = task(download_github_issues.get_watched_repositories)()
        t = task(download_github_issues.download_github_issues_for_repo).map(t)
        t |= task(etl_github_users.main, name='etl_github_users', **kws)()
        t |= task(etl_github_issues.main, name='etl_github_issues', **kws)()
        # TODO: flow.schedule =
    return flow
def build_flow():
    name = os.path.splitext(os.path.basename(__file__))[0]
    kws = dict(log_stdout=True)
    with Flow(name) as flow:
        t = task(download_github_issue_events.main, name='download_github_issue_events', **kws)()  # noqa
        t |= task(etl_github_users.main, name='etl_github_users', **kws)()  # noqa
        t |= task(etl_github_issue_events.main, name='etl_github_issue_events', **kws)()  # noqa
        # TODO: flow.schedule =
    return flow
def run_flow(config, callargs, main=True):
    name = config.get('name', 'Default')
    if main:
        _set_seed(config.get('seed', 0))
        # save python code here:
        # os.path.dirname(os.path.dirname(mlflow.get_artifact_uri()))
        name = _log_xp(config, name)
    with Flow(name) as flow:
        ref = load_flow_inputs(config, main)
        for op in config.get('flow', []):
            if op['f'].endswith('.yaml'):
                # use a context manager so the config file handle is closed
                with open(op['f'], 'r') as f:
                    subconfig = yaml.load(f, Loader=yaml.FullLoader)
                args = dict(zip(subconfig.get('args', []), get_task_args(op, ref, [])))
                kwargs = get_task_kwargs(op, ref, [])
                kwargs.update(args)
                run = task(run_flow)(subconfig, kwargs, main=False)
                ref.update(get_task_output(op, run))
            else:
                ref = run_task(op, ref)
    if main:
        flow.save("/tmp/flow.flow")
        log_artifact("/tmp/flow.flow")
        os.remove("/tmp/flow.flow")
    state = flow.run(**callargs)
    return return_flow_output(config, ref, state)
def to_prefect(self):
    """Compile the recipe to a Prefect.Flow object."""
    from prefect import Flow, task, unmapped

    # TODO: allow recipes to customize which stages to run
    cache_input_task = task(self.cache_input, name="cache_input")
    prepare_target_task = task(self.prepare_target, name="prepare_target")
    store_chunk_task = task(self.store_chunk, name="store_chunk")
    finalize_target_task = task(self.finalize_target, name="finalize_target")

    with Flow("pangeo-forge-recipe") as flow:
        cache_task = cache_input_task.map(input_key=list(self.iter_inputs()))
        upstream_tasks = [cache_task]
        prepare_task = prepare_target_task(upstream_tasks=upstream_tasks)
        store_task = store_chunk_task.map(
            chunk_key=list(self.iter_chunks()),
            upstream_tasks=[unmapped(prepare_task)],
        )
        _ = finalize_target_task(upstream_tasks=[store_task])

    return flow
def test_airflow_task_uses_its_own_trigger_rules_by_default(self, airflow_settings):
    task = AirflowTask(
        db_conn=airflow_settings["db_conn"],
        task_id="run_this_last",
        dag_id="example_bash_operator",
        env=airflow_settings,
    )
    with Flow(name="test single task") as flow:
        res = task(execution_date="2011-01-01")
    flow_state = flow.run()
    assert flow_state.is_successful()
    assert flow_state.result[res].is_skipped()
def test(e: Optional[Executor]):
    with TemporaryDirectory() as tmpdir:
        flow_result = LocalResult(tmpdir, serializer=JSONSerializer(), location="{task_name}.json")
        with Flow("write_result", result=flow_result) as f:
            _terminal = task(lambda: 42, checkpoint=True, name="magic")()
        with set_temporary_config({"flows.checkpointing": True}), raise_on_exception():
            f.run(executor=e)
        files = os.listdir(tmpdir)
        assert files == ["magic.json"], files
        with open(os.path.join(tmpdir, files[0]), "rb") as file:
            val = json.load(file)
        assert val == 42
def test_airflow_task_uses_cli_flags(self, airflow_settings):
    task = AirflowTask(
        db_conn=airflow_settings["db_conn"],
        task_id="run_this_last",
        dag_id="example_bash_operator",
        cli_flags=["-A"],
        env=airflow_settings,
    )
    with Flow(name="test single task") as flow:
        res = task(execution_date="2011-01-02")
    flow_state = flow.run()
    assert flow_state.is_successful()
    assert flow_state.result[res].is_successful()
    assert not flow_state.result[res].is_skipped()
    assert flow_state.result[res].result is None
def __call__(self, *args: Any, flow: Flow = None, **kwargs: Any) -> ResourceContext:
    if flow is None:
        flow = prefect.context.get("flow")
        if flow is None:
            raise ValueError("Could not infer an active Flow context.")
    init_task = prefect.task(self.resource_class, **self.init_task_kwargs)(
        *args, flow=flow, **kwargs
    )
    setup_task = ResourceSetupTask(**self.setup_task_kwargs)(init_task, flow=flow)
    cleanup_task = ResourceCleanupTask(**self.cleanup_task_kwargs)(
        init_task, setup_task, flow=flow
    )
    return ResourceContext(init_task, setup_task, cleanup_task, flow)
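# A hedged sketch of the resource-manager pattern this __call__ powers, using
# Prefect 1.x's `resource_manager` decorator. The TempDir class is
# illustrative, not from the source.
import shutil
import tempfile

from prefect import Flow, resource_manager, task


@resource_manager
class TempDir:
    def __init__(self, prefix: str):
        self.prefix = prefix

    def setup(self) -> str:
        # Create the resource; the return value is passed to tasks and cleanup.
        return tempfile.mkdtemp(prefix=self.prefix)

    def cleanup(self, path: str) -> None:
        # Runs even if downstream tasks fail.
        shutil.rmtree(path)


@task
def use_dir(path: str):
    print("working in", path)


with Flow("resource-demo") as flow:
    with TempDir("demo-") as path:
        use_dir(path)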
def pipelines_to_plan(self, pipelines: ParallelPipelines) -> Flow:
    with Flow("rechunker") as flow:
        for pipeline in pipelines:
            upstream_tasks = []  # type: List[task]
            for stage in pipeline.stages:
                stage_task = task(stage.function, name=stage.name)
                if stage.mappable is not None:
                    stage_task_called = stage_task.map(
                        list(stage.mappable),  # prefect doesn't accept a generator
                        config=unmapped(pipeline.config),
                        upstream_tasks=[unmapped(t) for t in upstream_tasks],
                    )
                else:
                    stage_task_called = stage_task(
                        config=pipeline.config, upstream_tasks=upstream_tasks
                    )
                upstream_tasks = [stage_task_called]
    return flow
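# A hedged sketch of calling pipelines_to_plan. The Stage/Pipeline dataclasses
# below are hypothetical stand-ins that only mirror the attributes the method
# reads (stages, config, function, name, mappable); `executor` stands in for
# an instance of the class that defines the method.
from dataclasses import dataclass
from typing import Any, Callable, Iterable, List, Optional


@dataclass
class Stage:
    function: Callable
    name: str
    mappable: Optional[Iterable] = None


@dataclass
class Pipeline:
    stages: List[Stage]
    config: Any = None


def copy_chunk(key, config=None):
    print(f"copying chunk {key} with config {config!r}")


pipelines = [
    Pipeline(stages=[Stage(copy_chunk, "copy", mappable=range(4))], config={"dst": "out.zarr"})
]
flow = executor.pipelines_to_plan(pipelines)  # `executor` is hypothetical
flow.run()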
    bias_correct_obs_by_method,
)
from cmip6_downscaling.workflows.paths import (
    build_gcm_identifier,
    build_obs_identifier,
    make_annual_pyramid_path,
    make_bias_corrected_gcm_path,
    make_bias_corrected_obs_path,
    make_daily_pyramid_path,
    make_interpolated_gcm_path,
    make_interpolated_obs_path,
    make_monthly_pyramid_path,
)
from cmip6_downscaling.workflows.utils import BBox, rechunk_zarr_array_with_caching, regrid_ds

get_obs_task = task(get_obs)
get_gcm_task = task(get_gcm)


@task
def build_bbox(latmin: str, latmax: str, lonmin: str, lonmax: str) -> dataclass:
    """Build a bounding box from lat/lon inputs using the BBox data class defined in /utils.py.

    Parameters
    ----------
    latmin : float
        Minimum latitude
    latmax : float
        Maximum latitude
runtime = runtimes.get_runtime()

intermediate_cache_store = CacheStore(
    config.get("storage.intermediate.uri"),
    storage_options=config.get("storage.intermediate.storage_options"),
)
results_cache_store = CacheStore(
    config.get("storage.results.uri"),
    storage_options=config.get("storage.results.storage_options"),
)

# Transform Functions into Tasks -----------------------------------------------------------
return_obs_task = task(
    return_obs,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_return_obs_path,
)
get_coarse_obs_task = task(
    get_coarse_obs,
    tags=['dask-resource:TASKSLOTS=1'],
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_coarse_obs_path,
)
get_spatial_anomalies_task = task(
    get_spatial_anomalies,
    tags=['dask-resource:TASKSLOTS=1'],
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_spatial_anomalies_path,
)
return_coarse_obs_full_time_task = task(
def _define_model_selection_flow():
    """Define flow that runs model selection.

    Specifically: data filtering, partitioning, model selection and
    optional persistence on a given dataset.

    Returns
    -------
    prefect.Flow
    """
    from prefect import task, Flow, Parameter, unmapped

    with Flow("model selection") as flow:
        df = Parameter("data")
        grid_search = Parameter("grid_search")
        target_col_name = Parameter("target_col_name")
        country_code_column = Parameter("country_code_column")
        include_rules = Parameter("include_rules")
        exclude_rules = Parameter("exclude_rules")
        parallel_over_columns = Parameter("parallel_over_columns")
        partition_columns = Parameter("partition_columns")
        frequency = Parameter("frequency")
        output_path = Parameter("output_path")
        persist_cv_data = Parameter("persist_cv_data")
        persist_cv_results = Parameter("persist_cv_results")
        persist_model_reprs = Parameter("persist_model_reprs")
        persist_best_model = Parameter("persist_best_model")
        persist_partition = Parameter("persist_partition")
        persist_model_selector_results = Parameter("persist_model_selector_results")

        df_filtered = task(filter_data)(
            df=df, include_rules=include_rules, exclude_rules=exclude_rules
        )
        partitions = task(partition_data)(df=df_filtered, partition_by=parallel_over_columns)
        parallel_over_dicts, partition_dfs = partitions["labels"], partitions["data"]

        train_data = task(prepare_data_for_training).map(
            df=partition_dfs,
            frequency=unmapped(frequency),
            partition_columns=unmapped(partition_columns),
            parallel_over_columns=unmapped(parallel_over_columns),
            country_code_column=unmapped(country_code_column),
        )
        results = task(select_model).map(
            df=train_data,
            target_col_name=unmapped(target_col_name),
            grid_search=unmapped(grid_search),
            partition_columns=unmapped(partition_columns),
            parallel_over_dict=parallel_over_dicts,
            frequency=unmapped(frequency),
            country_code_column=unmapped(country_code_column),
        )
        write_ok = task(persist_experts_in_physical_partition).map(
            results=results,
            folder_path=unmapped(output_path),
            persist_cv_results=unmapped(persist_cv_results),
            persist_cv_data=unmapped(persist_cv_data),
            persist_model_reprs=unmapped(persist_model_reprs),
            persist_best_model=unmapped(persist_best_model),
            persist_partition=unmapped(persist_partition),
            persist_model_selector_results=unmapped(persist_model_selector_results),
        )

    flow.set_reference_tasks([write_ok])
    return flow
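# Usage sketch: every input above is a Parameter, so values are supplied at
# run time. `params` is a hypothetical dict mapping each Parameter name to a
# concrete value, e.g. {"data": df, "grid_search": gs, "frequency": "D", ...}.
flow = _define_model_selection_flow()
state = flow.run(parameters=params)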
def _run(op, args, kwargs, maps):
    if len(maps):
        return task(getattr(tb, op['f'])).map(*args, **kwargs)
    else:
        return task(getattr(tb, op['f']))(*args, **kwargs)
# prepare a placeholder object for prefect tasks created from nuggets
# (a bare lambda is used purely as an attribute container)
ptask = lambda: None  # pylint: disable=C0103

# gather list of nuggets
NUGGET_LIST = [
    f
    for f in dir(nuggets)
    # exclude built-ins and include only functions
    if not f.startswith("__") and isinstance(getattr(nuggets, f), Callable)
]

# for each nugget, add it (and a parent) attr to ptask
for nugget in NUGGET_LIST:
    # parent attr for task
    parent = getattr(nuggets, nugget).__module__[20:]

    # if no parent attr attached to ptask, set it
    if not hasattr(ptask, parent):
        setattr(ptask, parent, lambda: None)

    # set attr per parent attr within ptask
    setattr(
        getattr(ptask, parent),
        getattr(nuggets, nugget).__name__,
        prefect.task(getattr(nuggets, nugget)),
    )
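# Usage sketch for the registry built above: a nugget function becomes a
# Prefect task reachable as ptask.<parent>.<name>. The module/function names
# below are hypothetical.
from prefect import Flow

with Flow("nugget-demo") as flow:
    data = ptask.sources.fetch_data()  # hypothetical nugget task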
@task
def read_files(files):
    dfs = [
        (
            pd.read_csv(file, sep=";")
            .rename(columns=str.lower)
            .assign(jaar=lambda x: int(str(file)[-20:-15]))
            # TODO: jaar as first column
        )
        for file in files
    ]
    return pd.concat(dfs)


to_gbq = task(pandas_gbq.to_gbq)
# gemeente.to_gbq('vektis.open_data_gemeente', project_id=CONFIG.gcp.project, if_exists='replace', location=CONFIG.gcp.location)

with Flow("Vektis open data") as flow:
    gcp_project = Parameter("gcp_project", required=True)
    gcp_location = Parameter("gcp_location", required=True)

    dataframes = read_files.map([files_gemeente, files_pc3])
    to_gbq = to_gbq.map(
        dataframe=dataframes,
        destination_table=["vektis.open_data_gemeente", "vektis.open_data_pc3"],
        project_id=unmapped(gcp_project),
        if_exists=unmapped("replace"),
        location=unmapped(gcp_location),
    )
def test_result_pipe():
    t = prefect.task(lambda x, foo: x + 1)
    with prefect.Flow("test"):
        # A task created using .pipe should be identical to one created by using __call__
        assert vars(t(1, foo="bar")) == vars(t.pipe(t, foo="bar"))
)
from cmip6_downscaling.workflows.utils import rechunk_zarr_array_with_caching

runtime = get_runtime()

intermediate_cache_store = CacheStore(
    config.get('storage.intermediate.uri'),
    storage_options=config.get('storage.intermediate.storage_options'),
)
results_cache_store = CacheStore(
    config.get('storage.results.uri'),
    storage_options=config.get('storage.results.storage_options'),
)

fit_and_predict_task = task(
    gard_fit_and_predict,
    checkpoint=True,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_gard_predict_output_path,
)

read_scrf_task = task(read_scrf)

gard_postprocess_task = task(
    gard_postprocess,
    result=XpersistResult(results_cache_store, serializer="xarray.zarr"),
    target=make_gard_post_processed_output_path,
)


@task(nout=3)
def prep_gard_input_task(
    obs: str,
        historical_period=historical_period,
        day_rolling_window=day_rolling_window,
        year_rolling_window=year_rolling_window,
    )
    hist_trend = trend.sel(time=historical_period)
    pred_trend = trend.sel(time=predict_period)
    trend = xr.combine_by_coords([hist_trend, pred_trend], combine_attrs='drop_conflicts')
    return trend


remove_epoch_trend_task = task(
    remove_epoch_trend,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_epoch_adjusted_gcm_path,
)


@task(
    checkpoint=True,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_bias_corrected_gcm_path,
)
def maca_coarse_bias_correction_task(
    ds_gcm: xr.Dataset,
    ds_obs: xr.Dataset,
    train_period_start: str,
    train_period_end: str,
    variables: Union[str, List[str]],
    fp_out = os.path.join(dir_out, f"{year}-{abbr}.csv")
    df_sched.to_csv(fp_out, index=False)
    return (year, abbr, df_sched)


# Triggers: https://docs.prefect.io/core/concepts/execution.html#triggers
# run the result if any of the above mapped results are successful
def concat_all_csv(dir_out, filename="yr2020"):
    df = pd.concat(map(pd.read_csv, glob.glob(f"{dir_out}/*.csv")))
    fp_out = os.path.join(dir_out, f"../{filename}.csv")
    df.to_csv(fp_out, index=False)


# Create some custom tasks
task_concat = task(concat_all_csv, trigger=any_successful)

with Flow("Raw Pull") as flow_pull:
    dir_out = Parameter("dir_out")
    from_year = Parameter("from_year")
    thru_year = Parameter("thru_year")

    abbrs = get_list_of_team_abbrs()
    year_abbrs = create_year_abbr_combo(abbrs, from_year=from_year, thru_year=thru_year)
    dfs = return_schedule_dataframe.map(year_abbrs, dir_out=unmapped(dir_out))
    concat = task_concat(dir_out, upstream_tasks=[dfs])


if __name__ == "__main__":
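# A self-contained sketch of the any_successful trigger used above (Prefect
# 1.x: the downstream task runs once all upstream tasks finish, provided at
# least one of them succeeded).
from prefect import Flow, task
from prefect.triggers import any_successful


@task
def might_fail(x):
    if x == 2:
        raise ValueError("boom")
    return x


@task(trigger=any_successful)
def summarize():
    print("at least one upstream task succeeded")


with Flow("trigger-demo") as demo_flow:
    results = might_fail.map([1, 2, 3])
    summarize(upstream_tasks=[results])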
    return response.text


@task
def load_file(filename: str) -> str:
    with open(filename, "r", encoding="utf-8") as file:
        return file.read()


@task
def printa(stuff):
    print(stuff)


# NOTE: this rebinds `task` from the decorator above to a ShellTask instance
task = ShellTask(return_all=True)

with Flow("shell") as f:
    translation_server_url = "http://localhost:1969"
    bibtex = load_file("./workspace/aksw-short.bib")
    zotero = import_translation(bibtex, translation_server_url)
    rdf = export_translation(zotero, translation_server_url, "rdf_bibliontology")
    turtle = task(command="rapper - -o turtle -I www.test.com > tests.ttl")
    printa(turtle)

f.run_config = DockerRun(image="prefecthq/prefect")
f.register(project_name="tutoriala")

# Configure extra environment variables for this flow,
# and set a custom image
# f.run()
""" # Mapping [Mapping](/core/concepts/mapping.md) in Prefect can be used to apply the same task (or tasks) to multiple inputs. To map a task, use its `.map` method instead of calling the task itself. ```python from prefect import task @task def add(x, y): return x + y add(1, 2) # 3 add.map([1, 10], [2, 3]) # [3, 13] ``` By default all arguments to `.map` expect iterables that can be mapped over. If you want to pass in a non-iterable argument to be used by all branches in a mapped task, you can wrap that argument in `unmapped` ([docs](/core/concepts/mapping.md#unmapped-inputs)). ```python from prefect import unmapped add.map([1, 10], unmapped(2)) # [3, 12] ``` In this example we build a flow with 4 stages: