def test_edge_equality():
    """Edges built from identical arguments are equal; any differing argument breaks equality."""
    upstream = Task()
    downstream = Task()

    # same arguments on both sides -> equal
    assert Edge(upstream, downstream) == Edge(upstream, downstream)
    assert Edge(upstream, downstream, "key") == Edge(upstream, downstream, "key")
    assert Edge(upstream, downstream, "key", True) == Edge(
        upstream, downstream, "key", True
    )

    # a single differing argument (task, key, or fourth positional flag) -> unequal
    assert Edge(upstream, downstream) != Edge(upstream, upstream)
    assert Edge(upstream, downstream, "key") != Edge(
        upstream, downstream, "other_key"
    )
    assert Edge(upstream, downstream, "key", True) != Edge(
        upstream, downstream, "key", False
    )
def get_flow_run_state(
    self,
    state: State,
    task_states: Dict[Task, State],
    task_contexts: Dict[Task, Dict[str, Any]],
    return_tasks: Set[Task],
    task_runner_state_handlers: Iterable[Callable],
    executor: "prefect.engine.executors.base.Executor",
) -> State:
    """
    Runs the flow: submits every task (in `self.flow.sorted_tasks()` order) to the
    executor, waits on the terminal / reference / returned tasks, and computes the
    final flow state from them.

    Args:
        - state (State): starting state for the Flow; must be a running state
        - task_states (dict): dictionary of task states to begin computation with,
            with keys being Tasks and values their corresponding state. This
            dictionary is updated in place as tasks are submitted.
        - task_contexts (Dict[Task, Dict[str, Any]]): contexts that will be provided
            to each task
        - return_tasks ([Task]): Tasks to include in the final returned Flow state;
            `None` is treated as an empty set
        - task_runner_state_handlers (Iterable[Callable]): A list of state change
            handlers that will be provided to the task_runner, and called whenever
            a task changes state.
        - executor (Executor): executor to use when performing computation

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.

    Raises:
        - ENDRUN: if `state` is not a running state
        - ValueError: if `return_tasks` contains tasks that are not in the flow
    """
    # the flow can only be executed from a Running state
    if not state.is_running():
        self.logger.info("Flow is not in a Running state.")
        raise ENDRUN(state)

    if return_tasks is None:
        return_tasks = set()
    if set(return_tasks).difference(self.flow.tasks):
        raise ValueError(
            "Some tasks in return_tasks were not found in the flow.")

    # -- process each task in order
    with executor.start():
        for task in self.flow.sorted_tasks():
            task_state = task_states.get(task)
            # Constant tasks with no provided state are resolved immediately to a
            # Success carrying their value, instead of being submitted
            if task_state is None and isinstance(
                    task, prefect.tasks.core.constants.Constant):
                task_states[task] = task_state = Success(result=task.value)

            # if the state is finished, don't run the task, just use the provided state
            # (cached and mapped states are excluded from this shortcut and are
            # still submitted below)
            if (isinstance(task_state, State) and task_state.is_finished()
                    and not task_state.is_cached()
                    and not task_state.is_mapped()):
                continue

            upstream_states = {
            }  # type: Dict[Edge, Union[State, Iterable]]

            # -- process each edge to the task
            # upstream tasks without a recorded state are represented by a Pending state
            for edge in self.flow.edges_to(task):
                upstream_states[edge] = task_states.get(
                    edge.upstream_task,
                    Pending(message="Task state not available."))

            # augment edges with upstream constants
            # each constant becomes a synthetic edge from a Constant task, paired
            # with an already-successful state holding the value
            for key, val in self.flow.constants[task].items():
                edge = Edge(
                    upstream_task=prefect.tasks.core.constants.Constant(
                        val),
                    downstream_task=task,
                    key=key,
                )
                upstream_states[edge] = Success(
                    "Auto-generated constant value",
                    result=Result(
                        val, result_handler=ConstantResultHandler(val)),
                )

            # -- run the task
            # the value stored in task_states is whatever executor.submit returns;
            # it is only resolved later through executor.wait
            with prefect.context(task_full_name=task.name,
                                 task_tags=task.tags):
                task_states[task] = executor.submit(
                    self.run_task,
                    task=task,
                    state=task_state,
                    upstream_states=upstream_states,
                    context=dict(prefect.context,
                                 **task_contexts.get(task, {})),
                    task_runner_state_handlers=task_runner_state_handlers,
                    executor=executor,
                )

        # ---------------------------------------------
        # Collect results
        # ---------------------------------------------
        # NOTE(review): the original indentation was lost; this section is assumed
        # to run inside `executor.start()` because it calls `executor.wait` — confirm.

        # terminal tasks determine if the flow is finished
        terminal_tasks = self.flow.terminal_tasks()

        # reference tasks determine flow state
        reference_tasks = self.flow.reference_tasks()

        # wait until all terminal tasks are finished
        final_tasks = terminal_tasks.union(reference_tasks).union(
            return_tasks)
        final_states = executor.wait({
            t: task_states.get(t,
                               Pending("Task not evaluated by FlowRunner."))
            for t in final_tasks
        })

        # also wait for any children of Mapped tasks to finish, and add them
        # to the dictionary to determine flow state
        all_final_states = final_states.copy()
        for t, s in list(final_states.items()):
            if s.is_mapped():
                s.map_states = executor.wait(s.map_states)
                s.result = [ms.result for ms in s.map_states]
                # the parent entry is replaced by its children's states, so the
                # flattening below considers each child individually
                all_final_states[t] = s.map_states

        assert isinstance(final_states, dict)
        key_states = set(
            flatten_seq([all_final_states[t] for t in reference_tasks]))
        terminal_states = set(
            flatten_seq([all_final_states[t] for t in terminal_tasks]))
        # return states use the un-expanded (parent) state for each requested task
        return_states = {t: final_states[t] for t in return_tasks}

        state = self.determine_final_state(
            state=state,
            key_states=key_states,
            return_states=return_states,
            terminal_states=terminal_states,
        )

        return state
def test_edge_has_tasks_property():
    """`Edge.tasks` is the set containing exactly the edge's two endpoint tasks."""
    t1 = Task()
    t2 = TaskWithKey()
    # the edge carries a key; the assertion below shows `tasks` still holds
    # only the two endpoint tasks
    edge = Edge(t1, t2, key="a_key")
    assert edge.tasks == {t1, t2}
def test_serialize_edge():
    """Dumping an `Edge` through `EdgeSchema` produces a plain dict."""
    edge = Edge(Task(), Task())
    serialized = EdgeSchema().dump(edge)
    assert isinstance(serialized, dict)
def test_object_inequality():
    """An `Edge` never compares equal to an unrelated object such as an int."""
    edge = Edge(Task(), Task())
    assert edge != 1
def test_set_upstream_context(self):
    """`set_upstream` called inside a flow context registers the edge on that flow."""
    with Flow(name="test") as flow:
        upstream = Task()
        downstream = Task()
        # no explicit `flow=` argument: the active context supplies the flow
        downstream.set_upstream(upstream)
        expected = Edge(upstream, downstream)
        assert expected in flow.edges
def generate(init, setup, task):
    """
    Build a mapping from three freshly created edges to the given values.

    The edges keyed "mgr" and "resource" map to `init` and `setup`; an edge
    without a key maps to `task`.
    """
    mgr_edge = Edge(Task(), Task(), key="mgr")
    resource_edge = Edge(Task(), Task(), key="resource")
    plain_edge = Edge(Task(), Task())
    return {
        mgr_edge: init,
        resource_edge: setup,
        plain_edge: task,
    }
def test_unmapped_annotation_takes_precedence(self):
    """An `unmapped` annotation on the upstream task overrides `mapped=True`."""
    annotated_upstream = edges.unmapped(Task())
    edge = Edge(annotated_upstream, Task(), mapped=True)
    assert edge.mapped is False
def test_flat(self):
    """A `flatten` annotation on the upstream task marks the edge as flattened."""
    annotated_upstream = edges.flatten(Task())
    edge = Edge(annotated_upstream, Task())
    assert edge.flattened is True
def test_mapped_kwarg(self):
    """Passing `mapped=True` marks the edge as mapped."""
    edge = Edge(Task(), Task(), mapped=True)
    assert edge.mapped is True
def test_unmapped(self):
    """An `unmapped` annotation on the upstream task yields a non-mapped edge."""
    annotated_upstream = edges.unmapped(Task())
    edge = Edge(annotated_upstream, Task())
    assert edge.mapped is False
def test_mapped_annotation_takes_precedance_over_kwarg(self):
    """A `mapped` annotation on the upstream task overrides `mapped=False`."""
    annotated_upstream = edges.mapped(Task())
    edge = Edge(annotated_upstream, Task(), mapped=False)
    assert edge.mapped is True
def test_mapped(self):
    """A `mapped` annotation on the upstream task marks the edge as mapped."""
    annotated_upstream = edges.mapped(Task())
    edge = Edge(annotated_upstream, Task())
    assert edge.mapped is True
def test_none(self):
    """Without annotations or keyword flags an edge is neither mapped nor flattened."""
    edge = Edge(Task(), Task())
    assert edge.mapped is False
    assert edge.flattened is False
def test_or(self):
    """Using `|` between two tasks inside a flow context adds an edge between them."""
    with Flow(name="test") as flow:
        upstream = Task()
        downstream = Task()
        # the expression is evaluated purely for its side effect on the flow
        upstream | downstream
        expected = Edge(upstream, downstream)
        assert expected in flow.edges
def test_flat_kwarg(self):
    """Passing `flattened=True` marks the edge as flattened."""
    edge = Edge(Task(), Task(), flattened=True)
    assert edge.flattened is True
def get_flow_run_state(
    self,
    state: State,
    task_states: Dict[Task, State],
    task_contexts: Dict[Task, Dict[str, Any]],
    return_tasks: Set[Task],
    task_runner_state_handlers: Iterable[Callable],
    executor: "prefect.engine.executors.base.Executor",
) -> State:
    """
    Runs the flow: submits every task (in `self.flow.sorted_tasks()` order) to the
    executor, expanding mapped tasks into one submission per child, then waits on
    the terminal / reference / returned tasks and computes the final flow state.

    Args:
        - state (State): starting state for the Flow; must be a running state
        - task_states (dict): dictionary of task states to begin computation with,
            with keys being Tasks and values their corresponding state. This
            dictionary is updated in place as tasks are submitted.
        - task_contexts (Dict[Task, Dict[str, Any]]): contexts that will be provided
            to each task
        - return_tasks ([Task]): Tasks to include in the final returned Flow state;
            `None` is treated as an empty set
        - task_runner_state_handlers (Iterable[Callable]): A list of state change
            handlers that will be provided to the task_runner, and called whenever
            a task changes state.
        - executor (Executor): executor to use when performing computation

    Returns:
        - State: `State` representing the final post-run state of the `Flow`.

    Raises:
        - ENDRUN: if `state` is not a running state
        - ValueError: if `return_tasks` contains tasks that are not in the flow
    """
    # this dictionary is used for tracking the states of "children" mapped tasks;
    # when running on Dask, we want to avoid serializing futures, so instead
    # of storing child task states in the `map_states` attribute we instead store
    # in this dictionary and only after they are resolved do we attach them to the Mapped state
    mapped_children = {}  # type: Dict[Task, list]

    if not state.is_running():
        self.logger.info("Flow is not in a Running state.")
        raise ENDRUN(state)

    if return_tasks is None:
        return_tasks = set()
    if set(return_tasks).difference(self.flow.tasks):
        raise ValueError(
            "Some tasks in return_tasks were not found in the flow.")

    def extra_context(task: Task, task_index: int = None) -> dict:
        # per-task values passed to `run_task` via its `extra_context` argument
        return {
            "task_name": task.name,
            "task_tags": task.tags,
            "task_index": task_index,
        }

    # -- process each task in order
    with executor.start():
        for task in self.flow.sorted_tasks():
            task_state = task_states.get(task)

            # if a task is a constant task, we already know its return value
            # no need to use up resources by running it through a task runner
            if task_state is None and isinstance(
                    task, prefect.tasks.core.constants.Constant):
                task_states[task] = task_state = Success(result=task.value)

            # if the state is finished, don't run the task, just use the provided state
            # if the state is cached / mapped, we still want to run the task runner pipeline steps
            # to either ensure the cache is still valid / or to recreate the mapped pipeline for
            # possible retries
            if (isinstance(task_state, State) and task_state.is_finished()
                    and not task_state.is_cached()
                    and not task_state.is_mapped()):
                continue

            upstream_states = {}  # type: Dict[Edge, State]

            # this dictionary is used exclusively for "reduce" tasks
            # in particular we store the states / futures corresponding to
            # the upstream children, and if running on Dask, let Dask resolve them at the appropriate time
            upstream_mapped_states = {}  # type: Dict[Edge, list]

            # -- process each edge to the task
            for edge in self.flow.edges_to(task):
                upstream_states[edge] = task_states.get(
                    edge.upstream_task,
                    Pending(message="Task state not available."))

                # this checks whether the task is a "reduce" task for a mapped pipeline
                # and if so, collects the appropriate upstream children
                if not edge.mapped and isinstance(upstream_states[edge],
                                                  Mapped):
                    upstream_mapped_states[edge] = mapped_children.get(
                        edge.upstream_task, [])

            # augment edges with upstream constants
            for key, val in self.flow.constants[task].items():
                edge = Edge(
                    upstream_task=prefect.tasks.core.constants.Constant(
                        val),
                    downstream_task=task,
                    key=key,
                )
                upstream_states[edge] = Success(
                    "Auto-generated constant value",
                    result=ConstantResult(value=val),
                )

            # handle mapped tasks
            # (generator form: stops at the first mapped edge and builds no list)
            if any(edge.mapped for edge in upstream_states):
                ## wait on upstream states to determine the width of the pipeline
                ## this is the key to depth-first execution
                ## (a shallow copy is handed to the executor, as before)
                upstream_states = executor.wait(dict(upstream_states))

                ## we submit the task to the task runner to determine if
                ## we can proceed with mapping - if the new task state is not a Mapped
                ## state then we don't proceed
                task_states[task] = executor.wait(
                    executor.submit(
                        run_task,
                        task=task,
                        state=task_state,  # original state
                        upstream_states=upstream_states,
                        context=dict(prefect.context,
                                     **task_contexts.get(task, {})),
                        flow_result=self.flow.result,
                        task_runner_cls=self.task_runner_cls,
                        task_runner_state_handlers=task_runner_state_handlers,
                        upstream_mapped_states=upstream_mapped_states,
                        is_mapped_parent=True,
                        extra_context=extra_context(task),
                    ))

                ## either way, we should now have enough resolved states to restructure
                ## the upstream states into a list of upstream state dictionaries to iterate over
                list_of_upstream_states = prepare_upstream_states_for_mapping(
                    task_states[task], upstream_states, mapped_children)

                submitted_states = []

                for idx, states in enumerate(list_of_upstream_states):
                    ## if we are on a future rerun of a partially complete flow run,
                    ## there might be mapped children in a retrying state; this check
                    ## looks into the current task state's map_states for such info
                    if (isinstance(task_state, Mapped)
                            and len(task_state.map_states) >= idx + 1):
                        current_state = task_state.map_states[
                            idx]  # type: Optional[State]
                    elif isinstance(task_state, Mapped):
                        current_state = None
                    else:
                        current_state = task_state

                    ## this is where each child is submitted for actual work
                    submitted_states.append(
                        executor.submit(
                            run_task,
                            task=task,
                            state=current_state,
                            upstream_states=states,
                            context=dict(
                                prefect.context,
                                **task_contexts.get(task, {}),
                                map_index=idx,
                            ),
                            flow_result=self.flow.result,
                            task_runner_cls=self.task_runner_cls,
                            task_runner_state_handlers=task_runner_state_handlers,
                            upstream_mapped_states=upstream_mapped_states,
                            extra_context=extra_context(task,
                                                        task_index=idx),
                        ))

                # children are only recorded when the parent resolved to a Mapped state
                if isinstance(task_states.get(task), Mapped):
                    mapped_children[
                        task] = submitted_states  # type: ignore

            else:
                task_states[task] = executor.submit(
                    run_task,
                    task=task,
                    state=task_state,
                    upstream_states=upstream_states,
                    context=dict(prefect.context,
                                 **task_contexts.get(task, {})),
                    flow_result=self.flow.result,
                    task_runner_cls=self.task_runner_cls,
                    task_runner_state_handlers=task_runner_state_handlers,
                    upstream_mapped_states=upstream_mapped_states,
                    extra_context=extra_context(task),
                )

        # ---------------------------------------------
        # Collect results
        # ---------------------------------------------
        # NOTE(review): the original indentation was lost; this section is assumed
        # to run inside `executor.start()` because it calls `executor.wait` — confirm.

        # terminal tasks determine if the flow is finished
        terminal_tasks = self.flow.terminal_tasks()

        # reference tasks determine flow state
        reference_tasks = self.flow.reference_tasks()

        # wait until all terminal tasks are finished
        final_tasks = terminal_tasks.union(reference_tasks).union(
            return_tasks)
        final_states = executor.wait({
            t: task_states.get(t,
                               Pending("Task not evaluated by FlowRunner."))
            for t in final_tasks
        })

        # also wait for any children of Mapped tasks to finish, and add them
        # to the dictionary to determine flow state
        all_final_states = final_states.copy()
        for t, s in list(final_states.items()):
            if s.is_mapped():
                # ensure we wait for any mapped children to complete
                if t in mapped_children:
                    s.map_states = executor.wait(mapped_children[t])
                s.result = [ms.result for ms in s.map_states]
                # the parent entry is replaced by its children's states, so the
                # flattening below considers each child individually
                all_final_states[t] = s.map_states

        assert isinstance(final_states, dict)
        key_states = set(
            flatten_seq([all_final_states[t] for t in reference_tasks]))
        terminal_states = set(
            flatten_seq([all_final_states[t] for t in terminal_tasks]))
        # return states use the un-expanded (parent) state for each requested task
        return_states = {t: final_states[t] for t in return_tasks}

        state = self.determine_final_state(
            state=state,
            key_states=key_states,
            return_states=return_states,
            terminal_states=terminal_states,
        )

        return state
def test_unmapped_annotation_takes_precedence(self):
    """A `flatten` annotation on the upstream task overrides `flattened=False`."""
    # NOTE(review): the name says "unmapped" but the body exercises `flatten`,
    # and another test in this file has the same name — if both live in
    # the same class, one silently shadows the other. Consider renaming.
    annotated_upstream = edges.flatten(Task())
    edge = Edge(annotated_upstream, Task(), flattened=False)
    assert edge.flattened is True
def test_set_upstream_with_properties(self, props):
    """Keyword properties passed to `set_upstream` are carried onto the created edge."""
    with Flow(name="test") as flow:
        upstream = Task()
        downstream = Task()
        downstream.set_upstream(upstream, **props)
        # the registered edge must match one built with the same properties
        expected = Edge(upstream, downstream, **props)
        assert expected in flow.edges
def test_nested_annotation(self):
    """Nesting `flatten` around `mapped` sets both flags on the resulting edge."""
    inner = edges.mapped(Task())
    annotated_upstream = edges.flatten(inner)
    edge = Edge(annotated_upstream, Task())
    assert edge.flattened is True
    assert edge.mapped is True
def test_set_downstream(self):
    """`set_downstream` with an explicit `flow=` registers the edge on that flow."""
    flow = Flow(name="test")
    upstream = Task()
    downstream = Task()
    # no flow context is active here, so the flow is passed explicitly
    upstream.set_downstream(downstream, flow=flow)
    expected = Edge(upstream, downstream)
    assert expected in flow.edges
children = mapped_children.get(edge.upstream_task, []) # if the edge is flattened, then we need to wait for the mapped children # to complete and then flatten them if edge.flattened: children = executors.flatten_mapped_children( mapped_children=children, executor=executor ) upstream_mapped_states[edge] = children # augment edges with upstream constants for key, val in self.flow.constants[task].items(): edge = Edge( upstream_task=prefect.tasks.core.constants.Constant(val), downstream_task=task, key=key, ) upstream_states[edge] = Success( "Auto-generated constant value", result=ConstantResult(value=val), ) # handle mapped tasks <<<<<<< HEAD if any(edge.mapped for edge in upstream_states.keys()): ======= if any([edge.mapped for edge in upstream_states.keys()]): >>>>>>> prefect clone # wait on upstream states to determine the width of the pipeline