def test_cloud_task_runner_handles_retries_with_queued_states_from_cloud(client):
    """
    A retry attempt that the client answers with `Queued` still ends in `Success`.

    The patched `set_task_run_state` records every call, echoes back the
    requested state, and answers the fourth call with `Queued()` instead.
    """
    recorded_calls = []

    def queued_mock(*args, **kwargs):
        recorded_calls.append(kwargs)
        # every call except the fourth simply echoes the requested state
        if len(recorded_calls) != 4:
            return kwargs.get("state")
        # the first retry attempt gets queued instead
        return Queued()  # immediate start time

    client.set_task_run_state = queued_mock

    @prefect.task(
        max_retries=2,
        retry_delay=datetime.timedelta(seconds=0),
        result=PrefectResult(),
    )
    def tagged_task(x):
        # fail only on the first run so that exactly one retry is needed
        if prefect.context.get("task_run_count", 1) == 1:
            raise ValueError("gimme a sec")
        return x

    upstream_result = PrefectResult(value=42, location="42")
    upstream_edge = Edge(Task(), tagged_task, key="x")
    final_state = CloudTaskRunner(task=tagged_task).run(
        context={"task_run_version": 1},
        state=None,
        upstream_states={upstream_edge: Success(result=upstream_result)},
        executor=prefect.engine.executors.LocalExecutor(),
    )

    assert final_state.is_successful()
    assert final_state.result == 42

    # Requested states; the 4th request (Running) is the one answered with Queued
    expected_requests = [
        "Running",
        "Failed",
        "Retrying",
        "Running",
        "Running",
        "Success",
    ]
    assert len(recorded_calls) == 6
    requested = [type(call["state"]).__name__ for call in recorded_calls]
    assert requested == expected_requests

    # the Retrying state carries the upstream value as a cached input
    retrying_state = recorded_calls[2]["state"]
    assert retrying_state.cached_inputs["x"].value == 42
def test_task_runner_sets_mapped_state_prior_to_executor_mapping(client):
    """
    `check_task_ready_to_map` raises ENDRUN and reports exactly one state
    to the client, and that state is a mapped state.
    """
    mapped_edge = Edge(Task(), Task(), key="foo", mapped=True)
    upstream_states = {mapped_edge: Success(result=[1, 2])}

    with pytest.raises(ENDRUN):
        CloudTaskRunner(task=Task()).check_task_ready_to_map(
            state=Pending(),
            upstream_states=upstream_states,
        )

    ## assertions
    assert client.get_task_run_info.call_count == 0  # never called
    assert client.set_task_run_state.call_count == 1  # Pending -> Mapped
    assert client.get_latest_cached_states.call_count == 0

    # a recorded mock call unpacks as (positional args, keyword args)
    _, last_kwargs = client.set_task_run_state.call_args_list[-1]
    assert last_kwargs["state"].is_mapped()
def test_load_results_from_upstream_reads_cached_inputs_using_upstream_results(
    self,
):
    """
    `load_results` fills the cached input "x" with the value produced by the
    upstream task's result class (99 from `CustomResult.read`).
    """

    class CustomResult(Result):
        def read(self, *args, **kwargs):
            self.value = 99
            return self

    stored = PrefectResult(location="1")
    initial_state = Pending(cached_inputs={"x": stored})
    upstream_edge = Edge(Task(result=CustomResult()), 2, key="x")

    runner = CloudTaskRunner(task=Task(result=PrefectResult()))
    # only the returned state is inspected; the returned upstream states are ignored
    loaded_state, _ = runner.load_results(
        state=initial_state,
        upstream_states={upstream_edge: Success(result=stored)},
    )

    assert loaded_state.cached_inputs["x"].value == 99
def test_version_to_date(self):
    """
    Running `VersionToDateTask` on version '1.0.0+2021-03-03T00.stinky-fish'
    yields the datetime parsed from '2021-03-03T00'.
    """
    version_to_date = VersionToDateTask()
    version = Version('1.0.0+2021-03-03T00.stinky-fish')
    version_edge = Edge(Task(), version_to_date, key='version')
    version_state = Success(result=ConstantResult(value=version))
    task_runner = TaskRunner(task=version_to_date)

    with raise_on_exception(), prefect.context():
        state = task_runner.run(upstream_states={version_edge: version_state})
        # print the failed state before failing so the cause shows in the output
        if state.is_failed():
            print(state)
            self.fail()

    self.assertEqual(state.result, pendulum.parse('2021-03-03T00'))
def test_task_runner_uses_upstream_result_handlers(client):
    """
    The downstream task receives the value produced by the upstream task's
    result class ("cool" from `MyResult.read`).
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            self.value = "cool"
            return self

        def write(self, *args, **kwargs):
            return self

    @prefect.task(result=PrefectResult())
    def t(x):
        return x

    upstream_edge = Edge(Task(result=MyResult()), t, key="x")
    upstream_success = Success(result=PrefectResult(location="1"))

    final_state = CloudTaskRunner(task=t).run(
        upstream_states={upstream_edge: upstream_success}
    )

    assert final_state.is_successful()
    assert final_state.result == "cool"
def test_edge_key_must_be_valid():
    """
    `Edge` accepts `None` and the string keys listed below, and raises
    `ValueError` for each of the rejected keys.
    """
    accepted_keys = (None, "test", "test_underscore")
    for good_key in accepted_keys:
        assert Edge(Task(), Task(), key=good_key)

    rejected_keys = (
        1,  # int key is not allowed
        Task(),
        "name with space",
        "5number",
        "this.that",
    )
    for bad_key in rejected_keys:
        with pytest.raises(ValueError):
            Edge(Task(), Task(), key=bad_key)
def test_task_runner_gracefully_handles_load_results_failures(client):
    """
    When the upstream result class raises on `read`, the run ends in a Failed
    state mentioning "task results", and only that state is sent to the client.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            raise TypeError("something is wrong!")

    @prefect.task(result=PrefectResult())
    def t(x):
        return x

    upstream_success = Success(result=MyResult(location="foo.txt"))
    upstream_edge = Edge(Task(result=MyResult()), t, key="x")

    final_state = CloudTaskRunner(task=t).run(
        upstream_states={upstream_edge: upstream_success}
    )

    assert final_state.is_failed()
    assert "task results" in final_state.message
    assert client.set_task_run_state.call_count == 1  # Pending -> Failed

    # each recorded call is (args, kwargs); collect the class name of every state sent
    sent_state_names = [
        type(recorded[1]["state"]).__name__
        for recorded in client.set_task_run_state.call_args_list
    ]
    assert sent_state_names == ["Failed"]
def test_cloud_task_runner_sends_heartbeat_on_queued_retries(client):
    """
    With the first retry attempt answered by `Queued`, the runner makes six
    state updates and calls the heartbeat twice with the task run id "id".
    """
    state_calls = []
    heartbeat_ids = []

    def queued_mock(*args, **kwargs):
        state_calls.append(kwargs)
        # every call except the fourth simply echoes the requested state
        if len(state_calls) != 4:
            return kwargs.get("state")
        # the first retry attempt gets queued instead
        return Queued()  # immediate start time

    def mock_heartbeat(**kwargs):
        heartbeat_ids.append(kwargs.get("task_run_id"))

    client.set_task_run_state = queued_mock
    client.update_task_run_heartbeat = mock_heartbeat

    @prefect.task(
        max_retries=2,
        retry_delay=datetime.timedelta(seconds=0),
        result=PrefectResult(),
    )
    def tagged_task(x):
        # fail only on the first run so that exactly one retry is needed
        if prefect.context.get("task_run_count", 1) == 1:
            raise ValueError("gimme a sec")
        return x

    upstream_result = PrefectResult(value=42, location="42")
    upstream_edge = Edge(Task(), tagged_task, key="x")
    run_context = {"task_run_version": 1, "task_run_id": "id"}

    CloudTaskRunner(task=tagged_task).run(
        context=run_context,
        state=None,
        upstream_states={upstream_edge: Success(result=upstream_result)},
    )

    assert len(state_calls) == 6
    assert heartbeat_ids == ["id", "id"]
def test_can_import_df(self):
    """
    Running `DataFrameToTableRequestTask` with a 10-row fake frame targeting
    'footable1' leaves a table of that name in `self.repo`.
    """
    repo_uri = 'sgr:///abc/1234?tag=1'

    checkout = SemanticCheckoutTask(upstream_repos={'abc': repo_uri})
    # the checkout is run first; its return value is not inspected by this test
    checkout.run()

    df_to_table = DataFrameToTableRequestTask(repo_uri=repo_uri)
    task_runner = TaskRunner(task=df_to_table)
    request_edge = Edge(Task(), df_to_table, key='request')
    request = DataFrameToTableRequest(data_frame=fake_data(10), table='footable1')
    request_state = Success(result=ConstantResult(value=request))

    with raise_on_exception(), prefect.context():
        state = task_runner.run(upstream_states={request_edge: request_state})
        # print the failed state before failing so the cause shows in the output
        if state.is_failed():
            print(state)
            self.fail()

    self.assertTrue(table_exists_at(self.repo, 'footable1'))
def test_skip_if_already_run(monkeypatch, test_logger, state, is_skipped):
    """
    Check skip_if_already_run against the workflow's most recent state.

    The task is expected to skip when that state is 'running' or 'success',
    and to run normally when it is None (never run before) or 'failed'.
    `state` and `is_skipped` are supplied together by the caller.
    """
    session_getter = Mock()
    recent_state_getter = Mock(return_value=state)
    monkeypatch.setattr("autoflow.utils.get_session", session_getter)
    monkeypatch.setattr(
        "autoflow.sensor.WorkflowRuns.get_most_recent_state",
        recent_state_getter,
    )

    task_runner = TaskRunner(task=skip_if_already_run)
    workflow_edge = Edge(
        prefect.Task(), skip_if_already_run, key="parametrised_workflow"
    )

    with set_temporary_config({"db_uri": "DUMMY_DB_URI"}):
        # the upstream result is a (workflow, parameters) pair
        parametrised_workflow = (
            prefect.Flow(name="DUMMY_WORFLOW_NAME"),
            {"DUMMY_PARAM": "DUMMY_VALUE"},
        )
        task_state = task_runner.run(
            upstream_states={workflow_edge: Success(result=parametrised_workflow)},
            context={"logger": test_logger},
        )

    session_getter.assert_called_once_with("DUMMY_DB_URI")
    recent_state_getter.assert_called_once_with(
        workflow_name="DUMMY_WORFLOW_NAME",
        parameters={"DUMMY_PARAM": "DUMMY_VALUE"},
        session=session_getter.return_value,
    )
    assert task_state.is_successful()
    assert is_skipped == task_state.is_skipped()
def test_run_workflow(test_logger):
    """
    Check that run_workflow executes the given workflow with the given
    parameters: the wrapped function must be called once with them.
    """
    function_mock = create_autospec(lambda dummy_param: None)

    with prefect.Flow("Dummy workflow") as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    task_runner = TaskRunner(task=run_workflow)
    workflow_edge = Edge(prefect.Task(), run_workflow, key="parametrised_workflow")
    # the upstream result is a (workflow, parameters) pair
    parametrised_workflow = (dummy_workflow, {"dummy_param": "DUMMY_VALUE"})

    task_state = task_runner.run(
        upstream_states={workflow_edge: Success(result=parametrised_workflow)},
        context={"logger": test_logger},
    )

    assert task_state.is_successful()
    function_mock.assert_called_once_with(dummy_param="DUMMY_VALUE")
def test_run_workflow_fails(test_logger):
    """
    Check that run_workflow ends in a failed state when the workflow it runs
    fails (here: the wrapped function raises).
    """
    function_mock = create_autospec(
        lambda dummy_param: None,
        side_effect=Exception("Workflow failed"),
    )

    with prefect.Flow("Dummy workflow") as dummy_workflow:
        dummy_param = prefect.Parameter("dummy_param")
        FunctionTask(function_mock)(dummy_param=dummy_param)

    task_runner = TaskRunner(task=run_workflow)
    workflow_edge = Edge(prefect.Task(), run_workflow, key="parametrised_workflow")
    # the upstream result is a (workflow, parameters) pair
    parametrised_workflow = (dummy_workflow, {"dummy_param": "DUMMY_VALUE"})

    task_state = task_runner.run(
        upstream_states={workflow_edge: Success(result=parametrised_workflow)},
        context={"logger": test_logger},
    )

    assert task_state.is_failed()
def get_flow_run_state(
    self,
    state: State,
    task_states: Dict[Task, State],
    task_contexts: Dict[Task, Dict[str, Any]],
    return_tasks: Set[Task],
    task_runner_state_handlers: Iterable[Callable],
    executor: "prefect.executors.base.Executor",
) -> State:
    """
    Runs the flow.

    Every task of the flow is visited in sorted order and submitted to the
    executor (mapped tasks are expanded into one submission per child); the
    states of the terminal, reference and returned tasks are then awaited and
    passed to `determine_final_state`.

    Args:
        - state (State): starting state for the Flow; it must be a running
            state, otherwise the run is ended immediately
        - task_states (dict): dictionary of task states to begin
            computation with, with keys being Tasks and values their
            corresponding state; updated in place as tasks are submitted
        - task_contexts (Dict[Task, Dict[str, Any]]): contexts that will be provided to
            each task
        - return_tasks ([Task], optional): list of Tasks to include in the
            final returned Flow state. `None` is treated as an empty set
        - task_runner_state_handlers (Iterable[Callable]): A list of state change handlers
            that will be provided to the task_runner, and called whenever a task changes
            state.
        - executor (Executor): executor to use when performing computation

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.

    Raises:
        - ENDRUN: if `state` is not a running state
        - ValueError: if `return_tasks` contains tasks that are not in the flow
    """
    # this dictionary is used for tracking the states of "children" mapped tasks;
    # when running on Dask, we want to avoid serializing futures, so instead
    # of storing child task states in the `map_states` attribute we instead store
    # in this dictionary and only after they are resolved do we attach them to the Mapped state
    mapped_children = dict()  # type: Dict[Task, list]

    if not state.is_running():
        self.logger.info("Flow is not in a Running state.")
        raise ENDRUN(state)

    if return_tasks is None:
        return_tasks = set()
    if set(return_tasks).difference(self.flow.tasks):
        raise ValueError("Some tasks in return_tasks were not found in the flow.")

    # builds the per-task dictionary passed to `run_task` as `extra_context`;
    # `task_index` stays None unless a mapped child index is supplied
    def extra_context(task: Task, task_index: int = None) -> dict:
        return {
            "task_name": task.name,
            "task_tags": task.tags,
            "task_index": task_index,
        }

    # -- process each task in order

    with self.check_for_cancellation(), executor.start():

        for task in self.flow.sorted_tasks():
            task_state = task_states.get(task)

            # if a task is a constant task, we already know its return value
            # no need to use up resources by running it through a task runner
            if task_state is None and isinstance(
                task, prefect.tasks.core.constants.Constant
            ):
                task_states[task] = task_state = Success(result=task.value)

            # Always restart completed resource setup/cleanup tasks and
            # secret tasks unless they were explicitly cached.
            # TODO: we only need to rerun these tasks if any pending
            # downstream tasks depend on them.
            if (
                isinstance(
                    task,
                    (
                        prefect.tasks.core.resource_manager.ResourceSetupTask,
                        prefect.tasks.core.resource_manager.ResourceCleanupTask,
                        prefect.tasks.secrets.SecretBase,
                    ),
                )
                and task_state is not None
                and task_state.is_finished()
                and not task_state.is_cached()
            ):
                task_states[task] = task_state = Pending()

            # if the state is finished, don't run the task, just use the provided state.
            # If the state is cached / mapped, we still want to run the task runner
            # pipeline steps to either ensure the cache is still valid / or to recreate
            # the mapped pipeline for possible retries
            if (
                isinstance(task_state, State)
                and task_state.is_finished()
                and not task_state.is_cached()
                and not task_state.is_mapped()
            ):
                continue

            upstream_states = {}  # type: Dict[Edge, State]

            # this dictionary is used exclusively for "reduce" tasks; in particular we store
            # the states / futures corresponding to the upstream children, and if running
            # on Dask, let Dask resolve them at the appropriate time.
            # Note: this is an optimization that allows Dask to resolve the mapped
            # dependencies by "elevating" them to a function argument.
            upstream_mapped_states = {}  # type: Dict[Edge, list]

            # -- process each edge to the task
            for edge in self.flow.edges_to(task):

                # load the upstream task states (supplying Pending as a default)
                upstream_states[edge] = task_states.get(
                    edge.upstream_task, Pending(message="Task state not available.")
                )

                # if the edge is flattened and not the result of a map, then we
                # preprocess the upstream states. If it IS the result of a
                # map, it will be handled in `prepare_upstream_states_for_mapping`
                if edge.flattened:
                    if not isinstance(upstream_states[edge], Mapped):
                        upstream_states[edge] = executor.submit(
                            executors.flatten_upstream_state, upstream_states[edge]
                        )

                # this checks whether the task is a "reduce" task for a mapped pipeline
                # and if so, collects the appropriate upstream children
                if not edge.mapped and isinstance(upstream_states[edge], Mapped):
                    children = mapped_children.get(edge.upstream_task, [])

                    # if the edge is flattened, then we need to wait for the mapped children
                    # to complete and then flatten them
                    if edge.flattened:
                        children = executors.flatten_mapped_children(
                            mapped_children=children, executor=executor
                        )

                    upstream_mapped_states[edge] = children

            # augment edges with upstream constants
            for key, val in self.flow.constants[task].items():
                edge = Edge(
                    upstream_task=prefect.tasks.core.constants.Constant(val),
                    downstream_task=task,
                    key=key,
                )
                upstream_states[edge] = Success(
                    "Auto-generated constant value",
                    result=ConstantResult(value=val),
                )

            # handle mapped tasks
            if any(edge.mapped for edge in upstream_states.keys()):

                # wait on upstream states to determine the width of the pipeline
                # this is the key to depth-first execution
                upstream_states = executor.wait(
                    {e: state for e, state in upstream_states.items()}
                )
                # we submit the task to the task runner to determine if
                # we can proceed with mapping - if the new task state is not a Mapped
                # state then we don't proceed
                task_states[task] = executor.wait(
                    executor.submit(
                        run_task,
                        task=task,
                        state=task_state,  # original state
                        upstream_states=upstream_states,
                        context=dict(
                            prefect.context, **task_contexts.get(task, {})
                        ),
                        flow_result=self.flow.result,
                        task_runner_cls=self.task_runner_cls,
                        task_runner_state_handlers=task_runner_state_handlers,
                        upstream_mapped_states=upstream_mapped_states,
                        is_mapped_parent=True,
                        extra_context=extra_context(task),
                    )
                )

                # either way, we should now have enough resolved states to restructure
                # the upstream states into a list of upstream state dictionaries to iterate over
                list_of_upstream_states = (
                    executors.prepare_upstream_states_for_mapping(
                        task_states[task],
                        upstream_states,
                        mapped_children,
                        executor=executor,
                    )
                )

                submitted_states = []

                for idx, states in enumerate(list_of_upstream_states):
                    # if we are on a future rerun of a partially complete flow run,
                    # there might be mapped children in a retrying state; this check
                    # looks into the current task state's map_states for such info
                    if (
                        isinstance(task_state, Mapped)
                        and len(task_state.map_states) >= idx + 1
                    ):
                        current_state = task_state.map_states[
                            idx
                        ]  # type: Optional[State]
                    elif isinstance(task_state, Mapped):
                        # the original Mapped state has no entry for this child index
                        current_state = None
                    else:
                        current_state = task_state

                    # this is where each child is submitted for actual work
                    submitted_states.append(
                        executor.submit(
                            run_task,
                            task=task,
                            state=current_state,
                            upstream_states=states,
                            context=dict(
                                prefect.context,
                                **task_contexts.get(task, {}),
                                map_index=idx,
                            ),
                            flow_result=self.flow.result,
                            task_runner_cls=self.task_runner_cls,
                            task_runner_state_handlers=task_runner_state_handlers,
                            upstream_mapped_states=upstream_mapped_states,
                            extra_context=extra_context(task, task_index=idx),
                        )
                    )
                # children are only tracked when the parent actually ended up Mapped
                if isinstance(task_states.get(task), Mapped):
                    mapped_children[task] = submitted_states  # type: ignore

            else:
                # unmapped task: a single submission whose result is stored as-is
                # (without being waited on here)
                task_states[task] = executor.submit(
                    run_task,
                    task=task,
                    state=task_state,
                    upstream_states=upstream_states,
                    context=dict(prefect.context, **task_contexts.get(task, {})),
                    flow_result=self.flow.result,
                    task_runner_cls=self.task_runner_cls,
                    task_runner_state_handlers=task_runner_state_handlers,
                    upstream_mapped_states=upstream_mapped_states,
                    extra_context=extra_context(task),
                )

        # ---------------------------------------------
        # Collect results
        # ---------------------------------------------

        # terminal tasks determine if the flow is finished
        terminal_tasks = self.flow.terminal_tasks()

        # reference tasks determine flow state
        reference_tasks = self.flow.reference_tasks()

        # wait until all terminal tasks are finished
        final_tasks = terminal_tasks.union(reference_tasks).union(return_tasks)
        final_states = executor.wait(
            {
                t: task_states.get(t, Pending("Task not evaluated by FlowRunner."))
                for t in final_tasks
            }
        )

        # also wait for any children of Mapped tasks to finish, and add them
        # to the dictionary to determine flow state
        all_final_states = final_states.copy()
        for t, s in list(final_states.items()):
            if s.is_mapped():
                # ensure we wait for any mapped children to complete
                if t in mapped_children:
                    s.map_states = executor.wait(mapped_children[t])
                s.result = [ms.result for ms in s.map_states]
                all_final_states[t] = s.map_states

        assert isinstance(final_states, dict)

    # mapped entries of `all_final_states` are lists of child states, hence the flattening
    key_states = set(flatten_seq([all_final_states[t] for t in reference_tasks]))
    terminal_states = set(
        flatten_seq([all_final_states[t] for t in terminal_tasks])
    )
    # returned states keep the (unflattened) parent state of each requested task
    return_states = {t: final_states[t] for t in return_tasks}

    state = self.determine_final_state(
        state=state,
        key_states=key_states,
        return_states=return_states,
        terminal_states=terminal_states,
    )

    return state
def test_mapped_kwarg(self):
    """An edge created with `mapped=True` reports `mapped` as True."""
    edge = Edge(Task(), Task(), mapped=True)
    assert edge.mapped is True
def test_object_inequality():
    """An edge does not compare equal to an object of another type (here: 1)."""
    edge = Edge(Task(), Task())
    assert edge != 1
def test_edge_equality():
    """
    Edges compare equal when built from the same tasks and the same extra
    arguments, and unequal when direction, key or fourth argument differ.
    """
    first = Task()
    second = Task()

    # identical construction arguments -> equal edges
    for extra_args in ((), ("key",), ("key", True)):
        assert Edge(first, second, *extra_args) == Edge(first, second, *extra_args)

    # reversed direction
    assert Edge(first, second) != Edge(second, first)
    # different key
    assert Edge(first, second, "key") != Edge(first, second, "other_key")
    # different fourth positional argument
    assert Edge(first, second, "key", True) != Edge(first, second, "key", False)
def test_edge_has_tasks_property():
    """`Edge.tasks` is the set made of the edge's two endpoint tasks."""
    upstream = Task()
    downstream = TaskWithKey()
    unrelated = Task()  # constructed as in the original test; not part of the edge

    connecting_edge = Edge(upstream, downstream, key="a_key")

    assert connecting_edge.tasks == {upstream, downstream}
def test_unmapped(self):
    """An upstream task wrapped in `edges.unmapped` gives an edge with `mapped` False."""
    upstream = edges.unmapped(Task())
    edge = Edge(upstream, Task())
    assert edge.mapped is False
def test_nested_annotation(self):
    """Nesting `edges.flatten` around `edges.mapped` sets both edge flags."""
    upstream = edges.flatten(edges.mapped(Task()))
    edge = Edge(upstream, Task())
    assert edge.flattened is True
    assert edge.mapped is True
def test_serialize_edge():
    """Dumping an edge through `EdgeSchema` produces a dict."""
    edge = Edge(Task(), Task())
    serialized = EdgeSchema().dump(edge)
    assert isinstance(serialized, dict)
def test_or(self):
    """Using `|` between two tasks inside a flow context adds the edge to the flow."""
    with Flow(name="test") as flow:
        upstream = Task()
        downstream = Task()
        upstream | downstream

    assert Edge(upstream, downstream) in flow.edges
def test_flat(self):
    """An upstream task wrapped in `edges.flatten` gives an edge with `flattened` True."""
    upstream = edges.flatten(Task())
    edge = Edge(upstream, Task())
    assert edge.flattened is True
def test_set_upstream_context(self):
    """`set_upstream` called inside a flow context adds the edge to that flow."""
    with Flow(name="test") as flow:
        upstream = Task()
        downstream = Task()
        downstream.set_upstream(upstream)

    assert Edge(upstream, downstream) in flow.edges
def test_flat_kwarg(self):
    """An edge created with `flattened=True` reports `flattened` as True."""
    edge = Edge(Task(), Task(), flattened=True)
    assert edge.flattened is True
def generate(init, setup, task):
    """
    Build a dictionary of three freshly created edges mapped to the given values.

    Args:
        - init: value stored under an edge with key "mgr"
        - setup: value stored under an edge with key "resource"
        - task: value stored under an edge created without a key

    Returns:
        - dict: the three edge -> value entries, in the order listed above
    """
    edge_values = {}
    edge_values[Edge(Task(), Task(), key="mgr")] = init
    edge_values[Edge(Task(), Task(), key="resource")] = setup
    edge_values[Edge(Task(), Task())] = task
    return edge_values
def test_unmapped_annotation_takes_precedence(self):
    """An `edges.unmapped` annotation wins over an explicit `mapped=True` argument."""
    upstream = edges.unmapped(Task())
    edge = Edge(upstream, Task(), mapped=True)
    assert edge.mapped is False
def test_set_upstream(self):
    """`set_upstream` with an explicit `flow` argument adds the edge to that flow."""
    flow = Flow(name="test")
    upstream = Task()
    downstream = Task()

    downstream.set_upstream(upstream, flow=flow)

    assert Edge(upstream, downstream) in flow.edges
def test_unmapped_annotation_takes_precedence(self):
    """
    An `edges.flatten` annotation wins over an explicit `flattened=False` argument.

    NOTE(review): despite its name, this test exercises the flatten
    annotation, not the unmapped one.
    """
    upstream = edges.flatten(Task())
    edge = Edge(upstream, Task(), flattened=False)
    assert edge.flattened is True
def test_set_upstream_with_properties(self, props):
    """
    Keyword properties passed to `set_upstream` end up on the edge that is
    added to the flow.
    """
    with Flow(name="test") as flow:
        upstream = Task()
        downstream = Task()
        downstream.set_upstream(upstream, **props)

    expected_edge = Edge(upstream, downstream, **props)
    assert expected_edge in flow.edges
def get_flow_run_state(
    self,
    state: State,
    task_states: Dict[Task, State],
    task_contexts: Dict[Task, Dict[str, Any]],
    return_tasks: Set[Task],
    task_runner_state_handlers: Iterable[Callable],
    executor: "prefect.engine.executors.base.Executor",
) -> State:
    """
    Runs the flow.

    Every task of the flow is visited in sorted order and submitted to the
    executor through `self.run_task`; the states of the terminal, reference
    and returned tasks are then awaited and passed to `determine_final_state`.

    Args:
        - state (State): starting state for the Flow; it must be a running
            state, otherwise the run is ended immediately
        - task_states (dict): dictionary of task states to begin
            computation with, with keys being Tasks and values their
            corresponding state; updated in place as tasks are submitted
        - task_contexts (Dict[Task, Dict[str, Any]]): contexts that will be provided to
            each task
        - return_tasks ([Task], optional): list of Tasks to include in the
            final returned Flow state. `None` is treated as an empty set
        - task_runner_state_handlers (Iterable[Callable]): A list of state change handlers
            that will be provided to the task_runner, and called whenever a task changes
            state.
        - executor (Executor): executor to use when performing computation

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.

    Raises:
        - ENDRUN: if `state` is not a running state
        - ValueError: if `return_tasks` contains tasks that are not in the flow
    """
    if not state.is_running():
        self.logger.info("Flow is not in a Running state.")
        raise ENDRUN(state)

    if return_tasks is None:
        return_tasks = set()
    if set(return_tasks).difference(self.flow.tasks):
        raise ValueError(
            "Some tasks in return_tasks were not found in the flow.")

    # -- process each task in order

    with executor.start():

        for task in self.flow.sorted_tasks():
            task_state = task_states.get(task)

            # a constant task without a provided state is given a Success state
            # built directly from its value
            if task_state is None and isinstance(
                    task, prefect.tasks.core.constants.Constant):
                task_states[task] = task_state = Success(result=task.value)

            # if the state is finished, don't run the task, just use the provided state
            # (cached and mapped states are still submitted below)
            if (isinstance(task_state, State) and task_state.is_finished()
                    and not task_state.is_cached()
                    and not task_state.is_mapped()):
                continue

            upstream_states = {
            }  # type: Dict[Edge, Union[State, Iterable]]

            # -- process each edge to the task
            # (a Pending state is supplied when the upstream state is not available)
            for edge in self.flow.edges_to(task):
                upstream_states[edge] = task_states.get(
                    edge.upstream_task,
                    Pending(message="Task state not available."))

            # augment edges with upstream constants
            for key, val in self.flow.constants[task].items():
                edge = Edge(
                    upstream_task=prefect.tasks.core.constants.Constant(
                        val),
                    downstream_task=task,
                    key=key,
                )
                upstream_states[edge] = Success(
                    "Auto-generated constant value",
                    result=ConstantResult(value=val),
                )

            # -- run the task
            # the submission happens with the task's name and tags set in the prefect context

            with prefect.context(task_full_name=task.name,
                                 task_tags=task.tags):
                task_states[task] = executor.submit(
                    self.run_task,
                    task=task,
                    state=task_state,
                    upstream_states=upstream_states,
                    context=dict(prefect.context,
                                 **task_contexts.get(task, {})),
                    task_runner_state_handlers=task_runner_state_handlers,
                    executor=executor,
                )

        # ---------------------------------------------
        # Collect results
        # ---------------------------------------------

        # terminal tasks determine if the flow is finished
        terminal_tasks = self.flow.terminal_tasks()

        # reference tasks determine flow state
        reference_tasks = self.flow.reference_tasks()

        # wait until all terminal tasks are finished
        final_tasks = terminal_tasks.union(reference_tasks).union(
            return_tasks)
        final_states = executor.wait({
            t: task_states.get(t,
                               Pending("Task not evaluated by FlowRunner."))
            for t in final_tasks
        })

        # also wait for any children of Mapped tasks to finish, and add them
        # to the dictionary to determine flow state
        all_final_states = final_states.copy()
        for t, s in list(final_states.items()):
            if s.is_mapped():
                s.map_states = executor.wait(s.map_states)
                s.result = [ms.result for ms in s.map_states]
                all_final_states[t] = s.map_states

        assert isinstance(final_states, dict)

    # mapped entries of `all_final_states` are lists of child states, hence the flattening
    key_states = set(
        flatten_seq([all_final_states[t] for t in reference_tasks]))
    terminal_states = set(
        flatten_seq([all_final_states[t] for t in terminal_tasks]))
    # returned states keep the (unflattened) parent state of each requested task
    return_states = {t: final_states[t] for t in return_tasks}

    state = self.determine_final_state(
        state=state,
        key_states=key_states,
        return_states=return_states,
        terminal_states=terminal_states,
    )

    return state