def test_task_runner_validates_cached_states_if_task_has_caching(client):
    """
    Both cached states handed back by the client carry an expiration in the
    past; the run must finish with the task's own return value (42) rather
    than either cached value (99 / 13).
    """

    @prefect.task(
        cache_for=datetime.timedelta(minutes=1), result_handler=JSONResultHandler()
    )
    def cached_task():
        return 42

    two_minutes = datetime.timedelta(minutes=2)
    one_day = datetime.timedelta(days=1)

    state = Cached(
        cached_result_expiration=datetime.datetime.utcnow() - two_minutes,
        result=Result(99, JSONResultHandler()),
    )
    old_state = Cached(
        cached_result_expiration=datetime.datetime.utcnow() - one_day,
        result=Result(13, JSONResultHandler()),
    )
    candidates = [state, old_state]
    client.get_latest_cached_states = MagicMock(return_value=candidates)

    final_state = CloudTaskRunner(task=cached_task).run()

    # the runner must have consulted the client for cached states
    assert client.get_latest_cached_states.called
    assert final_state.is_successful()
    assert final_state.is_cached()
    assert final_state.result == 42
def test_state_load_result_reads_if_location_is_provided(self, cls):
    """
    When the result passed to `load_result` has a location, its `read`
    method supplies the value of the returned state.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            self.value = "bar"
            return self

    empty_state = cls(result=Result())
    assert empty_state.message is None
    assert empty_state.result is None
    assert empty_state._result.location is None

    located_result = MyResult(location="foo")
    loaded_state = empty_state.load_result(located_result)

    assert loaded_state.message is None
    assert loaded_state.result == "bar"
    assert loaded_state._result.location == "foo"
def test_state_load_cached_results_doesnt_call_read_if_location_is_none(self, cls):
    """
    With neither a value nor a location available, `None` is taken to be
    the correct value and `read` must not run.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            # would be observable through the assertions below if it ran
            self.location = "foo"
            self.value = "bar"
            return self

    original = cls(cached_inputs=dict(x=Result()))
    replacement_inputs = dict(x=MyResult())

    hydrated = original.load_cached_results(replacement_inputs)
    cached_x = hydrated.cached_inputs["x"]

    assert cached_x.value is None
    assert hydrated.cached_inputs["x"].location is None
def test_result_validate_warns_when_run_without_run_validators_flag(self, caplog):
    """
    Calling `validate()` on a Result built with `run_validators=False` still
    runs the validators, and additionally logs a warning.
    """
    validator = MagicMock(return_value=True)
    r = Result(value=None, validators=[validator], run_validators=False)

    with caplog.at_level(logging.WARNING, "prefect.Result"):
        is_valid = r.validate()

    # it should have behaved normally and called the validate functions
    validator.assert_called_once_with(r)
    assert is_valid is True

    # but ALSO it should have published a warning log mentioning that
    # run_validators was not set
    captured = caplog.text
    assert "WARNING" in captured
    assert "run_validators" in captured
async def test_set_task_run_state_with_result(self, run_query, task_run_id):
    """
    Submit a serialized Success state carrying a stored result through the
    mutation, then check the persisted task run's state and version.
    """
    stored_result = Result(10, result_handler=JSONResultHandler())
    stored_result.store_safe_value()
    state = Success(result=stored_result)

    response = await run_query(
        query=self.mutation,
        variables=dict(
            input=dict(
                states=[dict(task_run_id=task_run_id, state=state.serialize())]
            )
        ),
    )

    returned_id = response.data.set_task_run_states.states[0].id
    tr = await models.TaskRun.where(id=returned_id).first({"state", "version"})

    assert tr.version == 2
    assert tr.state == "Success"
def test_state_load_cached_results_calls_read(self, cls):
    """
    The read logic of the result handed to `load_cached_results` must be
    used, not that of the result already stored on the state. This matters
    when "hydrating" JSON representations of Result objects coming from Cloud.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            self.location = "foo"
            self.value = 42
            return self

    original = cls(cached_inputs=dict(x=Result()))
    replacement_inputs = dict(x=MyResult(location=""))

    hydrated = original.load_cached_results(replacement_inputs)

    assert hydrated.cached_inputs["x"].value == 42
    assert hydrated.cached_inputs["x"].location == "foo"
def test_state_load_cached_results_doesnt_call_read_if_value_present(self, cls):
    """
    A cached input that already holds a value must be left alone by
    `load_cached_results`, so repeated calls do not trigger redundant reads
    from the result location.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            # would overwrite value/location if it were called
            self.location = "foo"
            self.value = "bar"
            return self

    original = cls(cached_inputs=dict(x=Result(value=42)))
    replacement_inputs = dict(x=MyResult())

    hydrated = original.load_cached_results(replacement_inputs)

    assert hydrated.cached_inputs["x"].value == 42
    assert hydrated.cached_inputs["x"].location is None
def __init__(
    self,
    task: Task,
    state_handlers: Iterable[Callable] = None,
    flow_result: Result = None,
):
    """
    Set up the runner for a single task.

    Args:
        - task (Task): the task this runner executes
        - state_handlers (Iterable[Callable], optional): forwarded unchanged
            to the parent class initializer
        - flow_result (Result, optional): result object supplied by the parent
            flow; used (as a copy) only when the task has no result of its own
    """
    self.context = prefect.context.to_dict()
    self.task = task

    # A result set on the task takes priority over the flow-level one; the
    # fallbacks are copied before being stored on the runner.
    if task.result:
        self.result = task.result
    elif flow_result is None:
        self.result = Result().copy()
    else:
        self.result = flow_result.copy()

    self.flow_result = flow_result
    super().__init__(state_handlers=state_handlers)
def test_cloud_task_runner_handles_retries_with_queued_states_from_cloud(client):
    """
    The stand-in for `set_task_run_state` answers the fourth call with a
    `Queued` state; the run must still finish successfully with the upstream
    value after six state-setting calls.
    """
    calls = []

    def queued_mock(*args, **kwargs):
        calls.append(kwargs)
        # first retry attempt will get queued
        if len(calls) == 4:
            return Queued()  # immediate start time
        return kwargs.get("state")

    client.set_task_run_state = queued_mock

    @prefect.task(max_retries=2, retry_delay=datetime.timedelta(seconds=0))
    def tagged_task(x):
        if prefect.context.get("task_run_count", 1) == 1:
            raise ValueError("gimme a sec")
        return x

    upstream_result = Result(value=42, result_handler=JSONResultHandler())
    upstream_states = {
        Edge(Task(), tagged_task, key="x"): Success(result=upstream_result)
    }

    res = CloudTaskRunner(task=tagged_task).run(
        context={"task_run_version": 1},
        state=None,
        upstream_states=upstream_states,
        executor=prefect.engine.executors.LocalExecutor(),
    )

    assert res.is_successful()
    assert res.result == 42

    # Running -> Failed -> Retrying -> Queued -> Running -> Success
    assert len(calls) == 6
    submitted_state_names = [type(c["state"]).__name__ for c in calls]
    assert submitted_state_names == [
        "Running",
        "Failed",
        "Retrying",
        "Running",
        "Running",
        "Success",
    ]

    # ensures result handler was called and persisted
    assert calls[2]["state"].cached_inputs["x"].safe_value.value == "42"
def test_state_load_result_doesnt_call_read_if_value_present(self, cls):
    """
    A state whose result already has a value must not be re-read by
    `load_result`, so repeated calls do not cause redundant reads from the
    remote result location.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            # would overwrite value/location if it were called
            self.location = "foo"
            self.value = "bar"
            return self

    populated_state = cls(result=Result(value=42))
    assert populated_state.message is None
    assert populated_state.result == 42

    reloaded_state = populated_state.load_result(MyResult())

    assert reloaded_state.result == 42
    assert reloaded_state._result.location is None
def __init__(
    self,
    task: Task,
    state_handlers: Iterable[Callable] = None,
    flow_result: Result = None,
):
    """
    Set up the runner for a single task.

    Args:
        - task (Task): the task this runner executes
        - state_handlers (Iterable[Callable], optional): forwarded unchanged
            to the parent class initializer
        - flow_result (Result, optional): result object supplied by the parent
            flow; used only when the task has no result of its own
    """
    self.context = prefect.context.to_dict()
    self.task = task

    # if the result was provided off the parent Flow object
    # we want to use the task's target as the target location
    if task.result:
        self.result = task.result
    else:
        # NOTE(review): `flow_result` is stored without copying, so the
        # `location` assignment below writes to the very object the caller
        # passed in -- confirm that sharing it this way is intended.
        self.result = Result() if flow_result is None else flow_result
        if self.task.target:
            self.result.location = self.task.target
    self.flow_result = flow_result
    super().__init__(state_handlers=state_handlers)
def test_task_runner_uses_cached_inputs_from_db_state(monkeypatch):
    """
    The state returned by `get_task_run_info` is a Retrying state with
    cached input x=41; the run must use it and produce 42.
    """

    @prefect.task(name="test")
    def add_one(x):
        return x + 1

    db_state = Retrying(cached_inputs=dict(x=Result(41)))
    task_run_info = MagicMock(state=db_state)
    get_task_run_info = MagicMock(return_value=task_run_info)
    set_task_run_state = MagicMock()
    client = MagicMock(
        get_task_run_info=get_task_run_info, set_task_run_state=set_task_run_state
    )
    client_factory = MagicMock(return_value=client)
    monkeypatch.setattr("prefect.engine.cloud.task_runner.Client", client_factory)

    res = CloudTaskRunner(task=add_one).run(context={"map_index": 1})

    ## assertions
    assert get_task_run_info.call_count == 1  # one time to pull latest state
    assert set_task_run_state.call_count == 2  # Pending -> Running -> Success
    assert res.is_successful()
    assert res.result == 42
def test_state_load_result_calls_read(self, cls):
    """
    The read logic of the result handed to `load_result` must be used, not
    that of the state's own `_result`. This matters when "hydrating" JSON
    representations of Result objects coming from Cloud.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            self.location = "foo"
            self.value = 42
            return self

    unloaded_state = cls(result=Result(location=""))
    assert unloaded_state.message is None
    assert unloaded_state.result is None

    reader = MyResult(location="")
    loaded_state = unloaded_state.load_result(reader)

    assert loaded_state.result == 42
    assert loaded_state._result.location == "foo"
def test_task_failure_caches_constant_inputs_automatically(client):
    """
    A task called with the constant 3 fails and goes to Retrying; its cached
    input `p` must equal a constant-handled Result whose safe value has been
    stored, both locally and in the last state sent to the client.
    """

    @prefect.task(max_retries=2, retry_delay=timedelta(seconds=100))
    def is_p_three(p):
        if p == 3:
            raise ValueError("No thank you.")

    with prefect.Flow("test") as f:
        res = is_p_three(3)

    state = CloudFlowRunner(flow=f).run(return_tasks=[res])

    assert state.is_running()
    assert isinstance(state.result[res], Retrying)

    expected_input = Result(3, result_handler=ConstantResultHandler(3))
    # equality only holds once the expected result has stored its safe value
    assert not state.result[res].cached_inputs["p"] == expected_input
    expected_input.store_safe_value()
    assert state.result[res].cached_inputs["p"] == expected_input

    final_call = client.set_task_run_state.call_args_list[-1]
    last_state = final_call[-1]["state"]
    assert isinstance(last_state, Retrying)
    assert last_state.cached_inputs["p"] == expected_input
def test_state_load_result_doesnt_call_read_if_location_is_none(self, cls):
    """
    With neither a value nor a location available, `None` is taken to be
    the correct value and `load_result` performs no read.
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            # would be observable through the assertions below if it ran
            self.location = "foo"
            self.value = "bar"
            return self

    empty_state = cls(result=Result())
    assert empty_state.message is None
    assert empty_state.result is None
    assert empty_state._result.location is None

    reloaded_state = empty_state.load_result(MyResult())

    assert reloaded_state.message is None
    assert reloaded_state.result is None
    assert reloaded_state._result.location is None
def test_reads_result_using_handler_attribute_if_cached_valid(self, client):
    """
    A still-valid Cached state passed in directly is returned as-is, with
    its value read through the task's own result class (53).
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            self.value = 53
            return self

    # constructing this task is expected to emit a UserWarning
    with pytest.warns(UserWarning):
        task = Task(cache_validator=duration_only, result=MyResult())

    result = PrefectResult(location="2")
    state = Cached(
        result=result,
        cached_result_expiration=pendulum.now("utc").add(minutes=1),
    )

    # the client offers no cached states of its own
    client.get_latest_cached_states = MagicMock(return_value=[])

    runner = CloudTaskRunner(task)
    new = runner.check_task_is_cached(state=state, inputs={"a": Result(1)})

    assert new is state
    assert new.result == 53
def test_state_kwarg_is_prioritized_over_db_caches(self, client):
    """
    When a valid Cached state is passed via `state=`, it is returned in
    preference to the cached state offered by the client.
    """
    task = Task(
        cache_for=datetime.timedelta(minutes=1),
        cache_validator=duration_only,
        result=PrefectResult(),
    )

    # state_a is what the client would return; state_b is passed explicitly
    state_a = Cached(
        result=PrefectResult(location="2"),
        cached_result_expiration=pendulum.now("utc").add(minutes=1),
    )
    state_b = Cached(
        result=PrefectResult(location="99"),
        cached_result_expiration=pendulum.now("utc").add(minutes=1),
    )
    client.get_latest_cached_states = MagicMock(return_value=[state_a])

    runner = CloudTaskRunner(task)
    new = runner.check_task_is_cached(state=state_b, inputs={"a": Result(1)})

    assert new is state_b
    assert new.result == 99
def test_reads_result_if_cached_valid_using_task_result(task, client):
    """
    Starting from Pending, the cached state offered by the client is
    returned and its value is read through the task's result class (53).
    """

    class MyResult(Result):
        def read(self, *args, **kwargs):
            self.value = 53
            return self

    # the `task` argument is replaced by a locally built task
    task = Task(
        result=MyResult(),
        cache_for=datetime.timedelta(minutes=1),
        cache_validator=duration_only,
    )

    state = Cached(
        result=PrefectResult(location="2"),
        cached_result_expiration=pendulum.now("utc").add(minutes=1),
    )
    client.get_latest_cached_states = MagicMock(return_value=[state])

    runner = CloudTaskRunner(task)
    new = runner.check_task_is_cached(state=Pending(), inputs={"a": Result(1)})

    assert new is state
    assert new.result == 53
def test_task_runner_handles_looping_with_no_result(client):
    """
    A task that raises LOOP until its third iteration must end successfully,
    never fetch task run info, and make six state-setting calls.
    """

    @prefect.task(result=Result())
    def looper():
        if prefect.context.get("task_loop_count", 1) < 3:
            raise LOOP()
        return 42

    runner = CloudTaskRunner(task=looper)
    res = runner.run(
        context={"task_run_version": 1}, state=None, upstream_states={}
    )

    ## assertions
    assert res.is_successful()
    assert client.get_task_run_info.call_count == 0

    # Pending -> Running -> Looped (1) -> Running -> Looped (2) -> Running -> Success
    assert client.set_task_run_state.call_count == 6

    # collect only the truthy `version` keyword values, in call order
    recorded_calls = client.set_task_run_state.call_args_list
    versions = [
        call[1]["version"] for call in recorded_calls if call[1]["version"]
    ]
    assert versions == [1, 3, 5]
def test_map_skips_dont_leak_out(executor):
    """
    A SKIP raised in one mapped child must stay confined to that child:
    the flow succeeds and the other children produce their values.
    """
    ll = ListTask()

    @task
    def add(x):
        if x == 1:
            raise prefect.engine.signals.SKIP("One is no good")
        return x + 1

    with Flow(name="test") as f:
        res = add.map(add.map(ll))

    s = f.run(executor=executor)
    m = s.result[res]

    assert s.is_successful()
    assert isinstance(m.map_states, list)
    assert len(m.result) == 3
    assert m.result == [None, 4, 5]

    # the first child is the skipped one and carries an empty result
    first_child = m.map_states[0]
    assert first_child.result is None
    assert first_child._result == Result()
    assert isinstance(first_child, prefect.engine.state.Skipped)
def test_task_failure_with_upstream_secrets_doesnt_store_secret_value_and_recompute_if_necessary(
    client,
):
    """
    First run: secret p=3 makes the task fail into Retrying with a
    secret-handled cached input. Second run: with the upstream replaced by a
    SafeResult and the secret now 4, the task must succeed with 4.
    """

    @prefect.task(max_retries=2, retry_delay=timedelta(seconds=100))
    def is_p_three(p):
        if p == 3:
            raise ValueError("No thank you.")
        return p

    with prefect.Flow("test", result_handler=JSONResultHandler()) as f:
        p = prefect.tasks.secrets.Secret("p")
        res = is_p_three(p)

    with prefect.context(secrets=dict(p=3)):
        state = CloudFlowRunner(flow=f).run(return_tasks=[res])

    assert state.is_running()
    assert isinstance(state.result[res], Retrying)

    expected_input = Result(3, result_handler=SecretResultHandler(p))
    # equality only holds once the expected result has stored its safe value
    assert not state.result[res].cached_inputs["p"] == expected_input
    expected_input.store_safe_value()
    assert state.result[res].cached_inputs["p"] == expected_input

    ## here we set the result of the secret to a saferesult, ensuring
    ## it will get converted to a "true" result;
    ## we expect that the upstream value will actually get recomputed from context
    ## through the SecretResultHandler
    safe = SafeResult("p", result_handler=SecretResultHandler(p))
    state.result[p] = Success(result=safe)
    state.result[res].start_time = pendulum.now("utc")
    state.result[res].cached_inputs = dict(p=safe)

    with prefect.context(secrets=dict(p=4)):
        rerun_state = CloudFlowRunner(flow=f).run(
            return_tasks=[res], task_states=state.result
        )

    assert rerun_state.is_successful()
    assert rerun_state.result[res].result == 4
def test_retries_ignore_cached_inputs_if_upstream_results_are_available(
    self, executor
):
    """
    On the second run the upstream result (altered to 100) must be used
    instead of the artificially planted cached input (2).
    """
    with Flow(name="test") as f:
        a = CountTask()
        b = ReturnTask(max_retries=1, retry_delay=datetime.timedelta(0))
        a_res = a()
        b_res = b(a_res)

    runner = FlowRunner(flow=f)
    first_state = runner.run(executor=executor, return_tasks=f.tasks)
    assert first_state.is_running()

    upstream_state = first_state.result[a_res]
    upstream_state.result = 100  # modify the result
    retrying_state = first_state.result[b_res]
    retrying_state.cached_inputs = dict(x=Result(2))  # artificially alter state

    with raise_on_exception():  # without caching we'd expect a KeyError
        second_state = FlowRunner(flow=f).run(
            executor=executor, return_tasks=[b_res], task_states=first_state.result
        )

    assert isinstance(second_state, Success)
    assert second_state.result[b_res].result == 1 / 99
def test_retries_use_cached_inputs(self, executor):
    """
    With the upstream result removed, the second run must fall back on the
    artificially planted cached input (2).
    """
    with Flow(name="test") as f:
        a = CountTask()
        b = ReturnTask(max_retries=1, retry_delay=datetime.timedelta(0))
        a_res = a()
        b_res = b(a_res)

    runner = FlowRunner(flow=f)
    first_state = runner.run(executor=executor, return_tasks=f.tasks)
    assert first_state.is_running()

    # remove the result to see if the cached results are picked up
    upstream_state = first_state.result[a_res]
    upstream_state.result = NoResult
    retrying_state = first_state.result[b_res]
    retrying_state.cached_inputs = dict(x=Result(2))  # artificially alter state

    with raise_on_exception():  # without caching we'd expect a KeyError
        second_state = FlowRunner(flow=f).run(
            executor=executor, return_tasks=[b_res], task_states=first_state.result
        )

    assert isinstance(second_state, Success)
    assert second_state.result[b_res].result == 1
def test_task_runner_treats_unfound_files_as_invalid_caches(client, tmpdir):
    """
    The first cached state points at a file path under `tmpdir` that this
    test never creates; the run must end with the second cached state's
    value (13).
    """

    @prefect.task(
        cache_for=datetime.timedelta(minutes=1), result_handler=JSONResultHandler()
    )
    def cached_task():
        return 42

    missing_path = str(tmpdir / "made_up_data.prefect")

    state = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(minutes=2),
        result=LocalResult(location=missing_path),
    )
    old_state = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(days=1),
        result=Result(13, JSONResultHandler()),
    )
    candidates = [state, old_state]
    client.get_latest_cached_states = MagicMock(return_value=candidates)

    final_state = CloudTaskRunner(task=cached_task).run()

    assert client.get_latest_cached_states.called
    assert final_state.is_successful()
    assert final_state.is_cached()
    assert final_state.result == 13
def test_task_runner_validates_cached_state_inputs_if_task_has_caching_and_uses_task_handler(
    client,
):
    """
    With the `all_inputs` validator, the cached state whose cached inputs
    match x=2 must be selected, and its value must be read through the
    task's own handler (which always yields 1337).
    """

    class Handler(ResultHandler):
        def read(self, val):
            return 1337

    @prefect.task(
        cache_for=datetime.timedelta(minutes=1),
        cache_validator=all_inputs,
        result_handler=Handler(),
    )
    def cached_task(x):
        return 42

    # unexpired, but carries no cached inputs
    dull_state = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(minutes=2),
        result=SafeResult("-1", JSONResultHandler()),
    )
    # unexpired, with a cached input for x
    state = Cached(
        cached_result_expiration=datetime.datetime.utcnow()
        + datetime.timedelta(minutes=2),
        result=SafeResult("99", JSONResultHandler()),
        cached_inputs={"x": SafeResult("2", result_handler=JSONResultHandler())},
    )
    candidates = [dull_state, state]
    client.get_latest_cached_states = MagicMock(return_value=candidates)

    runner = CloudTaskRunner(task=cached_task)
    res = runner.check_task_is_cached(
        Pending(), inputs={"x": Result(2, result_handler=JSONResultHandler())}
    )

    assert client.get_latest_cached_states.called
    assert res.is_successful()
    assert res.is_cached()
    assert res.result == 1337
def get_task_run_state(
    self,
    state: State,
    inputs: Dict[str, Result],
    timeout_handler: Optional[Callable] = None,
) -> State:
    """
    Runs the task and traps the signals and errors handled below
    (KeyboardInterrupt, TimeoutError, LOOP). Also stores the safe value of a
    successful task's result when checkpointing is enabled in context and
    `task.checkpoint` is `True`.

    Args:
        - state (State): the current state of this task
        - inputs (Dict[str, Result]): a dictionary of inputs whose keys
            correspond to the task's `run()` arguments.
        - timeout_handler (Callable, optional): function for timing out
            task execution, with call signature `handler(fn, *args, **kwargs)`. Defaults to
            `prefect.utilities.executors.timeout_handler`

    Returns:
        - State: `Success` on a normal return; `Cancelled` on
            KeyboardInterrupt; `TimedOut` on timeout; or the `Looped` state
            carried by a LOOP signal

    Raises:
        - signals.PAUSE: if the task raises PAUSE (it is not caught here)
        - ENDRUN: if the incoming state is not Running
        - TimeoutError: re-raised when `raise_on_exception` is set in context
    """
    # Guard: only a Running state may proceed to execution.
    if not state.is_running():
        self.logger.debug(
            "Task '{name}': can't run task because it's not in a "
            "Running state; ending run.".format(
                name=prefect.context.get("task_full_name", self.task.name)
            )
        )

        raise ENDRUN(state)

    try:
        self.logger.debug(
            "Task '{name}': Calling task.run() method...".format(
                name=prefect.context.get("task_full_name", self.task.name)
            )
        )
        # fall back on the library default when no handler was supplied
        timeout_handler = (
            timeout_handler or prefect.utilities.executors.timeout_handler
        )
        # unwrap the Result objects: task.run() receives the bare values
        raw_inputs = {k: r.value for k, r in inputs.items()}
        result = timeout_handler(
            self.task.run, timeout=self.task.timeout, **raw_inputs
        )

    except KeyboardInterrupt:
        self.logger.debug("Interrupt signal raised, cancelling task run.")
        state = Cancelled(message="Interrupt signal raised, cancelling task run.")
        return state

    # inform user of timeout
    except TimeoutError as exc:
        if prefect.context.get("raise_on_exception"):
            raise exc
        # keep the inputs on the state, stored as `cached_inputs`
        state = TimedOut(
            "Task timed out during execution.", result=exc, cached_inputs=inputs
        )
        return state

    except signals.LOOP as exc:
        new_state = exc.state
        assert isinstance(new_state, Looped)
        # wrap the looped value in a Result that uses this runner's handler
        new_state.result = Result(
            value=new_state.result, result_handler=self.result_handler
        )
        # use the signal's own message if present, else a default one
        new_state.message = exc.state.message or "Task is looping ({})".format(
            new_state.loop_count
        )
        return new_state

    # Normal completion: wrap the return value and mark the run successful.
    result = Result(value=result, result_handler=self.result_handler)
    state = Success(result=result, message="Task run succeeded.")

    ## only checkpoint tasks if checkpointing is turned on
    if (
        state.is_successful()
        and prefect.context.get("checkpointing") is True
        and self.task.checkpoint is True
    ):
        state._result.store_safe_value()

    return state
def run_mapped_task(
    self,
    state: State,
    upstream_states: Dict[Edge, State],
    context: Dict[str, Any],
    executor: "prefect.engine.executors.Executor",
) -> State:
    """
    If the task is being mapped, submits children tasks for execution. Returns a `Mapped` state.

    Builds one dictionary of upstream states per map index, then submits one
    `self.run(...)` call per index through `executor.map`.

    Args:
        - state (State): the current task state
        - upstream_states (Dict[Edge, State]): the upstream states
        - context (dict): prefect Context to use for execution; copied and
            extended with `map_index` for each child
        - executor (Executor): executor to use when performing computation

    Returns:
        - State: the state returned by `handle_state_change`; a `Mapped`
            state unless a state change substituted a different one

    Raises:
        - ENDRUN: not raised directly in this method.
            NOTE(review): presumably may propagate from
            `handle_state_change` -- confirm.
    """

    # one entry per map index: {edge: upstream state for that index}
    map_upstream_states = []

    # we don't know how long the iterables are, but we want to iterate until we reach
    # the end of the shortest one
    counter = itertools.count()

    # infinite loop, if upstream_states has any entries
    # (`True and x` is just `x`: an empty dict skips the loop entirely)
    while True and upstream_states:
        i = next(counter)
        states = {}

        try:

            for edge, upstream_state in upstream_states.items():

                # if the edge is not mapped over, then we take its state
                if not edge.mapped:
                    states[edge] = upstream_state

                # if the edge is mapped and the upstream state is Mapped, then we are mapping
                # over a mapped task. In this case, we take the appropriately-indexed upstream
                # state from the upstream tasks's `Mapped.map_states` array.
                # Note that these "states" might actually be futures at this time; we aren't
                # blocking until they finish.
                elif edge.mapped and upstream_state.is_mapped():
                    states[edge] = upstream_state.map_states[i]  # type: ignore

                # Otherwise, we are mapping over the result of a "vanilla" task. In this
                # case, we create a copy of the upstream state but set the result to the
                # appropriately-indexed item from the upstream task's `State.result`
                # array.
                else:
                    states[edge] = copy.copy(upstream_state)

                    # if the current state is already Mapped, then we might be executing
                    # a re-run of the mapping pipeline. In that case, the upstream states
                    # might not have `result` attributes (as any required results could be
                    # in the `cached_inputs` attribute of one of the child states).
                    # Therefore, we only try to get a result if EITHER this task's
                    # state is not already mapped OR the upstream result is not None.
                    if not state.is_mapped() or upstream_state._result != NoResult:
                        upstream_result = Result(
                            upstream_state.result[i],
                            result_handler=upstream_state._result.result_handler,  # type: ignore
                        )
                        states[edge].result = upstream_result
                    elif state.is_mapped():
                        # no upstream result to index into: bound the loop by
                        # the number of existing child states instead
                        if i >= len(state.map_states):  # type: ignore
                            raise IndexError()

            # only add this iteration if we made it through all iterables
            map_upstream_states.append(states)

        # index error means we reached the end of the shortest iterable
        except IndexError:
            break

    def run_fn(
        state: State, map_index: int, upstream_states: Dict[Edge, State]
    ) -> State:
        # Runs a single mapped child: same runner, with `map_index` added to
        # a copy of the caller-supplied context.
        map_context = context.copy()
        map_context.update(map_index=map_index)
        with prefect.context(self.context):
            return self.run(
                upstream_states=upstream_states,
                # if we set the state here, then it will not be processed by `initialize_run()`
                state=state,
                context=map_context,
                executor=executor,
            )

    # generate initial states, if available
    if isinstance(state, Mapped):
        initial_states = list(state.map_states)  # type: List[Optional[State]]
    else:
        initial_states = []
    # pad with None so there is one initial state per generated child; when
    # the difference is negative the multiplication yields an empty list
    initial_states.extend([None] * (len(map_upstream_states) - len(initial_states)))

    current_state = Mapped(
        message="Preparing to submit {} mapped tasks.".format(len(initial_states)),
        map_states=initial_states,  # type: ignore
    )
    state = self.handle_state_change(old_state=state, new_state=current_state)
    # the state change produced a different state: return it without submitting
    if state is not current_state:
        return state

    # map over the initial states, a counter representing the map_index, and also the mapped upstream states
    map_states = executor.map(
        run_fn, initial_states, range(len(map_upstream_states)), map_upstream_states
    )

    self.logger.debug(
        "{} mapped tasks submitted for execution.".format(len(map_states))
    )
    new_state = Mapped(
        message="Mapped tasks submitted for execution.", map_states=map_states
    )
    return self.handle_state_change(old_state=state, new_state=new_state)
def test_task_map_with_no_upstream_results_and_a_mapped_state(executor):
    """
    Mapped tasks must regenerate their children even when the pipeline is
    re-run without upstream results being available.

    The flow is run three times from different starting points, each with
    more upstream results missing, and must always sum to 12.
    """

    @prefect.task
    def numbers():
        return [1, 2, 3]

    @prefect.task
    def plus_one(x):
        return x + 1

    @prefect.task
    def get_sum(x):
        return sum(x)

    with Flow(name="test") as f:
        n = numbers()
        x = plus_one.map(n)
        y = plus_one.map(x)
        s = get_sum(y)

    # first run with a missing result from `n` but map_states for `x`
    pending_children = [
        Pending(cached_inputs={"x": Result(i)}) for i in range(1, 4)
    ]
    state = FlowRunner(flow=f).run(
        executor=executor,
        task_states={n: Success(), x: Mapped(map_states=pending_children)},
        return_tasks=f.tasks,
    )
    assert state.is_successful()
    assert state.result[s].result == 12

    # next run with missing results for n and x
    partially_done_y = [
        Success(result=3),
        Success(result=4),
        Retrying(cached_inputs={"x": Result(4)}),
    ]
    state = FlowRunner(flow=f).run(
        executor=executor,
        task_states={
            n: Success(),
            x: Mapped(map_states=[Success(), Success(), Success()]),
            y: Mapped(map_states=partially_done_y),
        },
        return_tasks=f.tasks,
    )
    assert state.is_successful()
    assert state.result[s].result == 12

    # next run with missing results for n, x, and y
    finished_y = [Success(result=3), Success(result=4), Success(result=5)]
    state = FlowRunner(flow=f).run(
        executor=executor,
        task_states={
            n: Success(),
            x: Mapped(map_states=[Success(), Success(), Success()]),
            y: Mapped(map_states=finished_y),
        },
        return_tasks=f.tasks,
    )
    assert state.is_successful()
    assert state.result[s].result == 12
def test_preparing_state_for_cloud_fails_if_cached_inputs_have_no_handler(cls):
    """
    A cached input whose Result has no result handler must make
    `prepare_state_for_cloud` fail with an AssertionError.
    """
    handlerless_input = Result(3, result_handler=None)
    candidate = cls(cached_inputs=dict(x=handlerless_input))

    with pytest.raises(AssertionError, match="no ResultHandler"):
        prepare_state_for_cloud(candidate)
def test_preparing_state_for_cloud_replaces_cached_inputs_with_safe(cls):
    """
    Preparing a state whose cached input has a JSON handler keeps the state
    class, leaves the state without a result, and yields cached inputs that
    compare equal to the original ones.
    """
    xres = Result(3, result_handler=JSONResultHandler())
    original = cls(cached_inputs=dict(x=xres))

    prepared = prepare_state_for_cloud(original)

    assert isinstance(prepared, cls)
    assert prepared.result == NoResult
    assert prepared.cached_inputs == dict(x=xres)