def test_required_parameters_must_be_provided():
    flow = Flow(name="test")
    y = prefect.Parameter("y")
    flow.add_task(y)
    flow_state = FlowRunner(flow=flow).run(return_tasks=[y])
    assert isinstance(flow_state, Failed)
    assert isinstance(flow_state.result[y], Failed)
    assert "required but not provided" in str(flow_state.result[y]).lower()
def test_init_errors_if_tasks_passed_to_parameters(self, cloud_api):
    with pytest.raises(TypeError, match="An instance of `Task` was passed"):
        StartFlowRun(
            name="testing",
            parameters={"a": 1, "b": prefect.Parameter("b")},
        )
def test_parameters_are_placed_into_context():
    flow = Flow(name="test")
    y = prefect.Parameter("y", default=99)
    flow.add_task(y)
    flow_state = FlowRunner(flow=flow).run(return_tasks=[y], parameters=dict(y=42))
    assert isinstance(flow_state, Success)
    assert flow_state.result[y].result == 42
async def test_create_run_with_missing_parameters_raises_error(self):
    flow_id = await flows.create_flow(
        serialized_flow=prefect.Flow(
            name="test", tasks=[prefect.Parameter("x")]
        ).serialize(),
    )
    with pytest.raises(ValueError) as exc:
        await runs.create_flow_run(flow_id=flow_id)
    assert "Required parameters were not supplied" in str(exc.value)
def test_parameters_can_be_set_in_context_if_none_passed():
    x = prefect.Parameter("x")
    f = FlowRunner(Flow(name="test", tasks=[x]))
    state = f.run(
        parameters={},
        context={"parameters": {"x": 5}},
        return_tasks=[x],
    )
    assert state.result[x].result == 5
def test_parameters_overwrite_context():
    x = prefect.Parameter("x")
    f = FlowRunner(Flow(name="test", tasks=[x]))
    state = f.run(
        parameters={"x": 2},
        context={"parameters": {"x": 5}},
        return_tasks=[x],
    )
    assert state.result[x].result == 2
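# The parameter/context precedence tests above exercise the same rules that
# surface through the public `Flow.run` API. A minimal illustrative sketch
# (not part of the test suite above; plain Prefect 1.x usage):
import prefect
from prefect import Flow

with Flow("param-demo") as demo:
    x = prefect.Parameter("x", default=10)

# with no runtime value, the declared default is used
state = demo.run()
assert state.result[x].result == 10

# a value passed at runtime overrides the default
state = demo.run(parameters={"x": 42})
assert state.result[x].result == 42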
async def test_set_schedule_active_handles_scheduled_param_defaults(self, project_id):
    a, b = prefect.Parameter("a"), prefect.Parameter("b", default=1)
    clock = prefect.schedules.clocks.CronClock(
        cron="* * * * *", parameter_defaults={"a": 1, "b": 2}
    )
    schedule = prefect.schedules.Schedule(clocks=[clock])
    flow = prefect.Flow("test-params", tasks=[a, b], schedule=schedule)
    flow_id = await api.flows.create_flow(
        project_id=project_id,
        serialized_flow=flow.serialize(),
        set_schedule_active=False,
    )
    assert flow_id
    assert await api.flows.set_schedule_active(flow_id=flow_id)
def test_auto_manage_dataframe_incorrect_type(local_file, caplog):
    task = TaskWithInputDataFrame()
    # flow name fixed to match the test (was copy-pasted from the
    # default_string test)
    with Flow('test_auto_manage_dataframe_incorrect_type') as flow:
        file = prefect.Parameter('local_file', default=123456)
        task(input_one=file)
    with pytest.raises(ValueError), caplog.at_level('FATAL', logger='prefect'):
        with raise_on_exception(), prefect.context(caches={}):
            flow.run()
async def test_create_flow_registers_flow_even_when_required_params(self, project_id):
    """
    We allow Flow registration to proceed even when there are required params,
    but we don't set the schedule to active.
    """
    a, b = prefect.Parameter("a"), prefect.Parameter("b", default=1)
    clock = prefect.schedules.clocks.CronClock(cron="* * * * *")
    schedule = prefect.schedules.Schedule(clocks=[clock])
    flow = prefect.Flow("test-params", tasks=[a, b], schedule=schedule)
    flow_id = await api.flows.create_flow(
        project_id=project_id, serialized_flow=flow.serialize()
    )
    assert flow_id
    db_flow = await models.Flow.where(id=flow_id).first({"is_schedule_active"})
    assert db_flow.is_schedule_active is False
def test_client_register_doesnt_raise_for_scheduled_params(
    patch_post, compressed, monkeypatch
):
    if compressed:
        response = {
            "data": {
                "project": [{"id": "proj-id"}],
                "create_flow_from_compressed_string": {"id": "long-id"},
            }
        }
    else:
        response = {
            "data": {
                "project": [{"id": "proj-id"}],
                "create_flow": {"id": "long-id"},
            }
        }
    patch_post(response)
    monkeypatch.setattr(
        "prefect.client.Client.get_default_tenant_slug",
        MagicMock(return_value="tslug"),
    )
    with set_temporary_config(
        {"cloud.api": "http://my-cloud.foo", "cloud.auth_token": "secret_token"}
    ):
        client = Client()

        a = prefect.schedules.clocks.DatesClock(
            [pendulum.now("UTC").add(seconds=0.1)], parameter_defaults=dict(x=1)
        )
        b = prefect.schedules.clocks.DatesClock(
            [pendulum.now("UTC").add(seconds=0.25)], parameter_defaults=dict(x=2)
        )
        x = prefect.Parameter("x", required=True)
        flow = prefect.Flow(
            "test", schedule=prefect.schedules.Schedule(clocks=[a, b]), tasks=[x]
        )
        flow.storage = prefect.environments.storage.Memory()
        flow.result_handler = flow.storage.result_handler

        flow_id = client.register(
            flow,
            project_name="my-default-project",
            compressed=compressed,
            version_group_id=str(uuid.uuid4()),
        )
    assert flow_id == "long-id"
async def test_create_run_uses_default_flow_parameters(self, project_id):
    flow_id = await api.flows.create_flow(
        project_id=project_id,
        serialized_flow=prefect.Flow(
            name="test", tasks=[prefect.Parameter("x", default=1)]
        ).serialize(),
    )
    flow_run_id = await api.runs.create_flow_run(flow_id=flow_id)
    flow_run = await models.FlowRun.where(id=flow_run_id).first({"parameters"})
    assert flow_run.parameters == dict(x=1)
def test_client_register_raises_if_required_param_isnt_scheduled(
    patch_post, monkeypatch, tmpdir
):
    response = {
        "data": {
            "project": [{"id": "proj-id"}],
            "create_flow": {"id": "long-id"},
        }
    }
    patch_post(response)
    monkeypatch.setattr(
        "prefect.client.Client.get_default_tenant_slug",
        MagicMock(return_value="tslug"),
    )
    with set_temporary_config(
        {
            "cloud.api": "http://my-cloud.foo",
            "cloud.auth_token": "secret_token",
            "backend": "cloud",
        }
    ):
        client = Client()

        a = prefect.schedules.clocks.DatesClock(
            [pendulum.now("UTC").add(seconds=0.1)], parameter_defaults=dict(x=1)
        )
        b = prefect.schedules.clocks.DatesClock(
            [pendulum.now("UTC").add(seconds=0.25)], parameter_defaults=dict(y=2)
        )
        x = prefect.Parameter("x", required=True)
        flow = prefect.Flow(
            "test", schedule=prefect.schedules.Schedule(clocks=[a, b]), tasks=[x]
        )
        flow.storage = prefect.environments.storage.Local(tmpdir)
        flow.result = flow.storage.result

        with pytest.raises(
            ClientError,
            match="Flows with required parameters can not be scheduled automatically",
        ):
            client.register(
                flow,
                project_name="my-default-project",
                compressed=False,
                version_group_id=str(uuid.uuid4()),
                no_url=True,
            )
async def test_set_schedule_active_with_required_parameters(self, project_id):
    flow = prefect.Flow(
        name="test",
        tasks=[prefect.Parameter("p", required=True)],
        schedule=prefect.schedules.IntervalSchedule(
            start_date=pendulum.now("EST"),
            interval=datetime.timedelta(minutes=1),
        ),
    )
    flow_id = await api.flows.create_flow(
        serialized_flow=flow.serialize(),
        project_id=project_id,
        set_schedule_active=False,
    )
    with pytest.raises(ValueError, match="required parameters"):
        await api.flows.set_schedule_active(flow_id=flow_id)
def test_auto_manage_dataframe_init_overwrite(local_file):
    task = TaskWithInputDataFrame(input_one_key='/bar')
    with Flow('test_auto_manage_dataframe_init_overwrite') as flow:
        file = prefect.Parameter('local_file', default=local_file)
        task(input_one=file)
    with raise_on_exception(), prefect.context(caches={}):
        flow_state = flow.run()
    result = list(flow_state.result.values())[0].result
    df_bar = pd.read_hdf(local_file.file, '/bar')
    assert isinstance(df_bar, pd.DataFrame)
    tm.assert_equal(result, df_bar.mean())
async def flow_id():
    flow = prefect.Flow(
        name="Test Flow",
        schedule=prefect.schedules.IntervalSchedule(
            start_date=pendulum.datetime(2018, 1, 1),
            interval=datetime.timedelta(days=1),
        ),
    )
    flow.add_edge(
        prefect.Task("t1", tags={"red", "blue"}),
        prefect.Task("t2", tags={"red", "green"}),
    )
    flow.add_task(prefect.Parameter("x", default=1))
    flow_id = await api.flows.create_flow(serialized_flow=flow.serialize())
    return flow_id
def test_environment_execute_with_kwargs():
    global_dict = {}

    @prefect.task
    def add_to_dict(x):
        global_dict["result"] = x

    environment = LocalEnvironment()
    storage = Memory()
    with prefect.Flow("test") as flow:
        x = prefect.Parameter("x")
        add_to_dict(x)

    flow_loc = storage.add_flow(flow)
    environment.execute(storage, flow_loc, x=42)
    assert global_dict.get("result") == 42
def test_auto_manage_dataframe_default_string(local_file):
    task = TaskWithInputDataFrame()
    filename = local_file.file.resolve()
    with Flow('test_auto_manage_dataframe_default_string') as flow:
        file = prefect.Parameter('local_file', default=filename)
        task(input_one=file)
    with raise_on_exception(), prefect.context(caches={}):
        flow_state = flow.run()
    result = list(flow_state.result.values())[0].result
    df_foo = pd.read_hdf(local_file.file, '/foo')
    assert isinstance(df_foo, pd.DataFrame)
    tm.assert_equal(result, df_foo.mean())
async def test_schedule_creates_parametrized_flow_runs(self):
    clock1 = prefect.schedules.clocks.IntervalClock(
        start_date=pendulum.now("UTC").add(minutes=1),
        interval=datetime.timedelta(minutes=2),
        parameter_defaults=dict(x="a"),
    )
    clock2 = prefect.schedules.clocks.IntervalClock(
        start_date=pendulum.now("UTC"),
        interval=datetime.timedelta(minutes=2),
        parameter_defaults=dict(x="b"),
    )
    flow = prefect.Flow(
        name="Test Scheduled Flow",
        schedule=prefect.schedules.Schedule(clocks=[clock1, clock2]),
    )
    flow.add_task(prefect.Parameter("x", default=1))
    flow_id = await api.flows.create_flow(serialized_flow=flow.serialize())

    schedule = await m.Schedule.where({"flow_id": {"_eq": flow_id}}).first("id")
    await m.Schedule.where(id=schedule.id).update(
        set={"last_checked": None, "last_scheduled_run_time": None}
    )
    await m.FlowRun.where({"flow_id": {"_eq": flow_id}}).delete()

    assert len(await api.schedules.schedule_flow_runs(schedule.id)) == 10

    flow_runs = await m.FlowRun.where({"flow_id": {"_eq": flow_id}}).get(
        selection_set={"parameters": True, "scheduled_start_time": True},
        order_by={"scheduled_start_time": EnumValue("asc")},
    )
    assert all(fr.parameters == dict(x="a") for fr in flow_runs[::2])
    assert all(fr.parameters == dict(x="b") for fr in flow_runs[1::2])
def test_auto_manage_dataframe_many_inputs(local_file):
    task = TaskWithTwoInputDataFrames()
    with Flow('test_auto_manage_dataframe_many_inputs') as flow:
        file = prefect.Parameter('local_file', default=local_file)
        task(input_one=file, input_two=file, offset=1.23)
    with raise_on_exception(), prefect.context(caches={}):
        flow_state = flow.run()
    result = list(flow_state.result.values())[0].result
    df_foo = pd.read_hdf(local_file.file, '/foo')
    df_bar = pd.read_hdf(local_file.file, '/bar')
    assert isinstance(df_foo, pd.DataFrame)
    assert isinstance(df_bar, pd.DataFrame)
    tm.assert_equal(result, df_foo.mean() + df_bar.mean() + 1.23)
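# The `test_auto_manage_dataframe_*` cases above depend on helper task classes
# (`TaskWithInputDataFrame`, `TaskWithTwoInputDataFrames`) and a `local_file`
# fixture defined elsewhere. A hypothetical sketch of the single-input helper,
# assuming it loads a DataFrame from an HDF key and returns its column means;
# the real implementation may differ:
import os

import pandas as pd
import prefect


class TaskWithInputDataFrame(prefect.Task):
    def __init__(self, input_one_key: str = "/foo", **kwargs):
        # key within the HDF file to read; one test overrides this with '/bar'
        self.input_one_key = input_one_key
        super().__init__(**kwargs)

    def run(self, input_one):
        # the tests pass either a plain path or a fixture object exposing `.file`
        path = getattr(input_one, "file", input_one)
        if not isinstance(path, (str, os.PathLike)):
            raise ValueError(f"cannot load a DataFrame from {input_one!r}")
        return pd.read_hdf(path, self.input_one_key).mean()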
def test_task_failure_caches_inputs_automatically(client):
    @prefect.task(max_retries=2, retry_delay=timedelta(seconds=100))
    def is_p_three(p):
        if p == 3:
            raise ValueError("No thank you.")

    with prefect.Flow("test") as f:
        p = prefect.Parameter("p")
        res = is_p_three(p)

    state = CloudFlowRunner(flow=f).run(return_tasks=[res], parameters=dict(p=3))
    assert state.is_running()
    assert isinstance(state.result[res], Retrying)
    assert state.result[res].cached_inputs["p"].location == "3"

    last_state = client.set_task_run_state.call_args_list[-1][-1]["state"]
    assert isinstance(last_state, Retrying)
    assert last_state.cached_inputs["p"].location == "3"
async def labeled_flow_id(project_id):
    flow = prefect.Flow(
        name="Labeled Flow",
        run_config=prefect.run_configs.UniversalRun(labels=["foo", "bar"]),
        schedule=prefect.schedules.IntervalSchedule(
            start_date=pendulum.datetime(2018, 1, 1),
            interval=datetime.timedelta(days=1),
        ),
    )
    flow.add_edge(
        prefect.Task("t1", tags={"red", "blue"}),
        prefect.Task("t2", tags={"red", "green"}),
    )
    flow.add_task(prefect.Parameter("x", default=1))
    flow_id = await api.flows.create_flow(
        project_id=project_id, serialized_flow=flow.serialize()
    )
    return flow_id
def test_run_workflow_ignores_schedule(test_logger):
    """
    Test that run_workflow ignores the workflow's schedule.
    """
    function_mock = create_autospec(lambda dummy_param: None)
    # Flow with no more scheduled runs
    with prefect.Flow(
        "Dummy_workflow",
        schedule=CronSchedule("0 0 * * *", end_date=pendulum.now().subtract(days=2)),
    ) as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    with prefect.context(logger=test_logger):
        run_workflow.run(
            parametrised_workflow=(dummy_workflow, dict(dummy_param="DUMMY_VALUE"))
        )
    function_mock.assert_called_once_with(dummy_param="DUMMY_VALUE")
async def labeled_flow_id():
    flow = prefect.Flow(
        name="Labeled Flow",
        environment=prefect.environments.execution.remote.RemoteEnvironment(
            labels=["foo", "bar"]
        ),
        schedule=prefect.schedules.IntervalSchedule(
            start_date=pendulum.datetime(2018, 1, 1),
            interval=datetime.timedelta(days=1),
        ),
    )
    flow.add_edge(
        prefect.Task("t1", tags={"red", "blue"}),
        prefect.Task("t2", tags={"red", "green"}),
    )
    flow.add_task(prefect.Parameter("x", default=1))
    flow_id = await api.flows.create_flow(serialized_flow=flow.serialize())
    return flow_id
def create_cdc_single_state_flow():
    with Flow(CDCCovidDataTracker.__name__) as flow:
        state = prefect.Parameter("state")
        connstr = EnvVarSecret("COVID_DB_CONN_URI")
        sentry_dsn = EnvVarSecret("SENTRY_DSN")
        sentry_sdk_task = initialize_sentry(sentry_dsn)

        d = create_scraper(CDCCovidDataTracker, state=state)
        fetched = fetch(d)
        normalized = normalize(d)
        validated = validate(d)
        done = put(d, connstr)

        d.set_upstream(sentry_sdk_task)
        normalized.set_upstream(fetched)
        validated.set_upstream(normalized)
        done.set_upstream(validated)
    return flow
def test_client_deploy_rejects_setting_active_schedules_for_flows_with_req_params(
    active, monkeypatch
):
    post = MagicMock()
    monkeypatch.setattr("requests.post", post)
    with set_temporary_config(
        {"cloud.graphql": "http://my-cloud.foo", "cloud.auth_token": "secret_token"}
    ):
        client = Client()

        flow = prefect.Flow(name="test", schedule=prefect.schedules.Schedule())
        flow.add_task(prefect.Parameter("x", required=True))

        with pytest.raises(ClientError) as exc:
            client.deploy(
                flow, project_name="my-default-project", set_schedule_active=active
            )
    assert (
        str(exc.value)
        == "Flows with required parameters can not be scheduled automatically."
    )
async def test_set_schedule_active_handles_flow_group_defaults(self, project_id):
    flow = prefect.Flow(
        name="test",
        tasks=[prefect.Parameter("p", required=True)],
        schedule=prefect.schedules.IntervalSchedule(
            start_date=pendulum.now("EST"),
            interval=datetime.timedelta(minutes=1),
        ),
    )
    flow_id = await api.flows.create_flow(
        serialized_flow=flow.serialize(),
        project_id=project_id,
        set_schedule_active=False,
    )
    # set a default for "p" at the flow group level
    flow_group = await models.Flow.where(id=flow_id).first({"flow_group_id"})
    await models.FlowGroup.where(id=flow_group.flow_group_id).update(
        set=dict(default_parameters={"p": 1})
    )
    assert await api.flows.set_schedule_active(flow_id=flow_id) is True
def test_run_workflow(test_logger):
    """
    Test that the run_workflow task runs a workflow with the given parameters.
    """
    function_mock = create_autospec(lambda dummy_param: None)
    with prefect.Flow("Dummy workflow") as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    runner = TaskRunner(task=run_workflow)
    upstream_edge = Edge(prefect.Task(), run_workflow, key="parametrised_workflow")
    task_state = runner.run(
        upstream_states={
            upstream_edge: Success(
                result=(dummy_workflow, dict(dummy_param="DUMMY_VALUE"))
            )
        },
        context=dict(logger=test_logger),
    )
    assert task_state.is_successful()
    function_mock.assert_called_once_with(dummy_param="DUMMY_VALUE")
def test_run_workflow_fails(test_logger):
    """
    Test that the run_workflow task fails if the workflow fails.
    """
    function_mock = create_autospec(
        lambda dummy_param: None, side_effect=Exception("Workflow failed")
    )
    with prefect.Flow("Dummy workflow") as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    runner = TaskRunner(task=run_workflow)
    upstream_edge = Edge(prefect.Task(), run_workflow, key="parametrised_workflow")
    task_state = runner.run(
        upstream_states={
            upstream_edge: Success(
                result=(dummy_workflow, dict(dummy_param="DUMMY_VALUE"))
            )
        },
        context=dict(logger=test_logger),
    )
    assert task_state.is_failed()
@prefect.task
def processing1(fp: str):
    logger = prefect.context.get("logger")
    logger.info(f"Doing some processing1 on {fp} ...")


@prefect.task
def processing2(fp: str):
    logger = prefect.context.get("logger")
    logger.info(f"Doing some processing2 on {fp} ...")


with prefect.Flow("gfs-post-processing", result=PrefectResult()) as flow:
    fp = prefect.Parameter("fp")
    p1 = processing1(fp)
    p2 = processing2(fp)
    p2.set_upstream(p1)

repo_ref = os.getenv("DATAFETCH__STORAGE__REPO__REF", default="master")
print(f"Registering using GitHub repo ref {repo_ref}")
flow.storage = GitHub(
    repo="steph-ben/datafetch-config",
    ref=repo_ref,
    path="projects/gfs/post_process.py",
    secrets=["GITHUB_ACCESS_TOKEN"],
)
flow.run_config = DockerRun()

if __name__ == "__main__":
    from datafetch.utils import show_prefect_cli_helper
@task
def transform(df):
    # strip parentheses and stray whitespace from column names
    df.columns = df.columns.str.strip('() ')
    return df


@task
def load(df, extract_name):
    connect = (
        f"postgresql+psycopg2://{db_user.get()}:{db_pass.get()}"
        f"@{db_host.get()}:{db_port.get()}/{db_db.get()}"
    )
    engine = create_engine(connect)
    df.to_sql(
        extract_name,
        con=engine,
        index=False,
        schema='raw',
        if_exists='replace',
    )


with Flow("ETL") as flow:
    extract_name = prefect.Parameter('extract_name', default='goodreads')
    e = extract(extract_name)
    t = transform(e)
    l = load(t, extract_name)

flow.register(project_name="Quantified Self")
flow.run_agent()
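# The ETL snippet above references an `extract` task and `db_*` secret handles
# that are defined elsewhere. A hypothetical sketch of those pieces, assuming
# the connection details come from Prefect Secrets and the extract step reads
# a local CSV named after the parameter; the real definitions may differ:
import pandas as pd
from prefect import task
from prefect.client import Secret

db_user = Secret("DB_USER")
db_pass = Secret("DB_PASS")
db_host = Secret("DB_HOST")
db_port = Secret("DB_PORT")
db_db = Secret("DB_NAME")


@task
def extract(extract_name):
    # assumption: raw exports live alongside the flow as '<extract_name>.csv'
    return pd.read_csv(f"{extract_name}.csv")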