def assign_id_and_return(self, id: str):
    if self.id:
        raise _user_exceptions.FlyteAssertion(
            f"Error assigning ID: {id} because {self} is already assigned. Has this node been assigned to another "
            "workflow already?"
        )
    self._id = _dnsify(id) if id else None
    self._metadata.name = id
    return self
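# A minimal, self-contained sketch of the fluent-assignment pattern used above. The
# _dnsify_sketch helper is a hypothetical stand-in that only approximates the real
# _dnsify (lowercase plus dashes); FlyteAssertion and the real node classes are not used.
import re


def _dnsify_sketch(value: str) -> str:
    # Approximation: lowercase, then replace characters invalid in DNS-1123 names with dashes.
    return re.sub(r"[^a-z0-9-]", "-", value.lower())


class _NodeSketch:
    def __init__(self):
        self._id = None

    @property
    def id(self):
        return self._id

    def assign_id_and_return(self, id: str):
        if self.id:
            raise AssertionError(f"Node already assigned id {self.id}")
        self._id = _dnsify_sketch(id) if id else None
        return self  # returning self enables chaining at call sites


node = _NodeSketch().assign_id_and_return("My_Task-0")
assert node.id == "my-task-0"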
def __init__(
    self,
    id,
    upstream_nodes,
    bindings,
    metadata,
    sdk_task=None,
    sdk_workflow=None,
    sdk_launch_plan=None,
    sdk_branch=None,
):
    """
    :param Text id: A workflow-level unique identifier that identifies this node in the workflow. "inputs" and
        "outputs" are reserved node ids that cannot be used by other nodes.
    :param flytekit.models.core.workflow.NodeMetadata metadata: Extra metadata about the node.
    :param list[flytekit.models.literals.Binding] bindings: Specifies how to bind the underlying interface's
        inputs. All required inputs specified in the underlying interface must be fulfilled.
    :param list[SdkNode] upstream_nodes: Specifies execution dependencies for this node, ensuring it will only
        get scheduled to run after all its upstream nodes have completed. This node will have an implicit
        dependency on any node that appears in its inputs field.
    :param flytekit.common.tasks.task.SdkTask sdk_task: The task to execute in this node.
    :param flytekit.common.workflow.SdkWorkflow sdk_workflow: The workflow to execute in this node.
    :param flytekit.common.launch_plan.SdkLaunchPlan sdk_launch_plan: The launch plan to execute in this node.
    :param TODO sdk_branch: TODO
    """
    non_none_entities = [
        entity for entity in [sdk_workflow, sdk_branch, sdk_launch_plan, sdk_task] if entity is not None
    ]
    if len(non_none_entities) != 1:
        raise _user_exceptions.FlyteAssertion(
            "An SDK node must have exactly one underlying entity specified at a time. Received the following "
            "entities: {}".format(non_none_entities)
        )

    workflow_node = None
    if sdk_workflow is not None:
        workflow_node = _component_nodes.SdkWorkflowNode(sdk_workflow=sdk_workflow)
    elif sdk_launch_plan is not None:
        workflow_node = _component_nodes.SdkWorkflowNode(sdk_launch_plan=sdk_launch_plan)

    super(SdkNode, self).__init__(
        id=_dnsify(id) if id else None,
        metadata=metadata,
        inputs=bindings,
        upstream_node_ids=[n.id for n in upstream_nodes],
        output_aliases=[],  # TODO: Are aliases a thing in SDK nodes
        task_node=_component_nodes.SdkTaskNode(sdk_task) if sdk_task else None,
        workflow_node=workflow_node,
        branch_node=sdk_branch.target if sdk_branch else None,
    )
    self._upstream = upstream_nodes
    self._executable_sdk_object = sdk_task or sdk_workflow or sdk_branch or sdk_launch_plan
    self._outputs = OutputParameterMapper(self._executable_sdk_object.interface.outputs, self)
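# The exactly-one-entity check above generalizes to any mutually exclusive keyword set.
# A minimal sketch of that validation (names here are illustrative, not flytekit APIs):
def _exactly_one(**entities):
    provided = {name: value for name, value in entities.items() if value is not None}
    if len(provided) != 1:
        raise ValueError(f"Expected exactly one entity, received: {sorted(provided) or 'none'}")
    return next(iter(provided.values()))


# e.g. _exactly_one(sdk_task=my_task, sdk_workflow=None, sdk_launch_plan=None) returns my_task,
# while passing two non-None entities (or zero) raises ValueError.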
def assign_id_and_return(self, id):
    """
    :param Text id:
    :rtype: SdkNode
    """
    if self.id:
        raise _user_exceptions.FlyteAssertion(
            "Error assigning ID: {} because {} is already assigned. Has this node been assigned to another "
            "workflow already?".format(id, self)
        )
    self._id = _dnsify(id) if id else None
    return self
def __init__(
    self,
    id: str,
    metadata: _workflow_model.NodeMetadata,
    bindings: List[_literal_models.Binding],
    upstream_nodes: List[Node],
    flyte_entity: Any,
):
    self._id = _dnsify(id)
    self._metadata = metadata
    self._bindings = bindings
    self._upstream_nodes = upstream_nodes
    self._flyte_entity = flyte_entity
    self._sdk_node = None
    self._aliases: _workflow_model.Alias = None
def __init__(
    self,
    id: str,
    metadata: _workflow_model.NodeMetadata,
    bindings: List[_literal_models.Binding],
    upstream_nodes: List[Node],
    flyte_entity: Any,
):
    if id is None:
        raise ValueError("Illegal construction of node without a Node ID")
    self._id = _dnsify(id)
    self._metadata = metadata
    self._bindings = bindings
    self._upstream_nodes = upstream_nodes
    self._flyte_entity = flyte_entity
    self._aliases: _workflow_model.Alias = None
    self._outputs = None
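# Why the explicit None guard above helps: without it, a None id would surface as an opaque
# failure inside _dnsify. A minimal sketch of the guard's effect (stand-in code, not flytekit;
# the lowercase/replace line is a crude approximation of _dnsify):
from typing import Optional


def make_node_id(id: Optional[str]) -> str:
    if id is None:
        raise ValueError("Illegal construction of node without a Node ID")
    return id.lower().replace("_", "-")  # crude stand-in for _dnsify


assert make_node_id("my_node") == "my-node"
# make_node_id(None)  # raises ValueError with a clear message, not an AttributeError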
def __init__(
    self,
    id,
    upstream_nodes,
    bindings,
    metadata,
    flyte_task: "flytekit.control_plane.tasks.task.FlyteTask" = None,
    flyte_workflow: "flytekit.control_plane.workflow.FlyteWorkflow" = None,
    flyte_launch_plan=None,
    flyte_branch=None,
    parameter_mapping=True,
):
    non_none_entities = list(filter(None, [flyte_task, flyte_workflow, flyte_launch_plan, flyte_branch]))
    if len(non_none_entities) != 1:
        raise _user_exceptions.FlyteAssertion(
            "A Flyte node must have exactly one underlying entity specified at a time. Received the following "
            "entities: {}".format(non_none_entities)
        )

    workflow_node = None
    if flyte_workflow is not None:
        workflow_node = _component_nodes.FlyteWorkflowNode(flyte_workflow=flyte_workflow)
    elif flyte_launch_plan is not None:
        workflow_node = _component_nodes.FlyteWorkflowNode(flyte_launch_plan=flyte_launch_plan)

    super(FlyteNode, self).__init__(
        id=_dnsify(id) if id else None,
        metadata=metadata,
        inputs=bindings,
        upstream_node_ids=[n.id for n in upstream_nodes],
        output_aliases=[],
        task_node=_component_nodes.FlyteTaskNode(flyte_task) if flyte_task else None,
        workflow_node=workflow_node,
        branch_node=flyte_branch,
    )
    self._upstream = upstream_nodes
def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.
    :param context:
    :param flytekit.models.literals.LiteralMap inputs:
    :rtype: (_dynamic_job.DynamicJobSpec, dict[Text, flytekit.models.common.FlyteIdlEntity])
    """
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs,
        {k: _type_helpers.get_sdk_type_from_literal_type(v.type) for k, v in _six.iteritems(self.interface.inputs)},
    )
    outputs_dict = {
        name: PromiseOutputReference(_type_helpers.get_sdk_type_from_literal_type(variable.type))
        for name, variable in _six.iteritems(self.interface.outputs)
    }

    inputs_dict.update(outputs_dict)
    yielded_sub_tasks = [
        sub_task for sub_task in super(SdkDynamicTask, self)._execute_user_code(context, inputs_dict) or []
    ]

    upstream_nodes = list()
    output_bindings = [
        _literal_models.Binding(
            var=name,
            binding=_interface.BindingData.from_python_std(
                b.sdk_type.to_flyte_literal_type(), b.raw_value, upstream_nodes=upstream_nodes
            ),
        )
        for name, b in _six.iteritems(outputs_dict)
    ]
    upstream_nodes = set(upstream_nodes)

    generated_files = {}
    # Keeping future-tasks in original order. We don't use upstream_nodes exclusively because the parent task can
    # yield sub-tasks that it never uses to produce final outputs but they need to execute nevertheless.
    array_job_index = {}
    tasks = []
    nodes = []
    visited_nodes = set()
    generated_ids = {}
    effective_failure_ratio = self._allowed_failure_ratio or 0.0
    for sub_task_node in _itertools.chain(yielded_sub_tasks, upstream_nodes):
        if sub_task_node in visited_nodes:
            continue
        visited_nodes.add(sub_task_node)

        # Generate an id that's unique in the document (if the same task is used multiple times with
        # different resources, executable_sdk_object.id will be the same but generated node_ids should not be).
        safe_task_id = _six.text_type(sub_task_node.executable_sdk_object.id)
        if safe_task_id in generated_ids:
            new_count = generated_ids[safe_task_id] = generated_ids[safe_task_id] + 1
        else:
            new_count = generated_ids[safe_task_id] = 0
        unique_node_id = _dnsify("{}-{}".format(safe_task_id, new_count))

        # If the task can run as an array job, group its instances together. Otherwise, keep each invocation as a
        # separate node.
        if SdkDynamicTask._can_run_as_array(sub_task_node.executable_sdk_object.type):
            if sub_task_node.executable_sdk_object in array_job_index:
                array_job, node = array_job_index[sub_task_node.executable_sdk_object]
                array_job.size += 1
                array_job.min_successes = int(math.ceil((1 - effective_failure_ratio) * array_job.size))
            else:
                array_job = self._create_array_job(inputs_prefix=unique_node_id)
                node = sub_task_node.assign_id_and_return(unique_node_id)
                array_job_index[sub_task_node.executable_sdk_object] = (array_job, node)

            node_index = _six.text_type(array_job.size - 1)
            for k, node_output in _six.iteritems(sub_task_node.outputs):
                if not node_output.sdk_node.id:
                    node_output.sdk_node.assign_id_and_return(node.id)
                node_output.var = "[{}].{}".format(node_index, node_output.var)

            # Upload inputs to working directory under /array_job.input_ref/<index>/inputs.pb
            input_path = _os.path.join(node.id, node_index, _constants.INPUT_FILE_NAME)
            generated_files[input_path] = _literal_models.LiteralMap(
                literals={binding.var: binding.binding.to_literal_model() for binding in sub_task_node.inputs}
            )
        else:
            node = sub_task_node.assign_id_and_return(unique_node_id)

            tasks.append(sub_task_node.executable_sdk_object)
            nodes.append(node)

            for k, node_output in _six.iteritems(sub_task_node.outputs):
                if not node_output.sdk_node.id:
                    node_output.sdk_node.assign_id_and_return(node.id)

            # Upload inputs to working directory under /array_job.input_ref/inputs.pb
            input_path = _os.path.join(node.id, _constants.INPUT_FILE_NAME)
            generated_files[input_path] = _literal_models.LiteralMap(
                literals={binding.var: binding.binding.to_literal_model() for binding in sub_task_node.inputs}
            )

    # Assign the custom field to the ArrayJob properties computed.
    for task, (array_job, _) in _six.iteritems(array_job_index):
        # TODO: Reconstruct task template object instead of modifying an existing one?
        tasks.append(
            task.assign_custom_and_return(array_job.to_dict()).assign_type_and_return(
                _constants.SdkTaskType.CONTAINER_ARRAY_TASK
            )
        )

    # min_successes is absolute; it's computed as the inverse of allowed_failure_ratio, multiplied by the
    # total number of nodes to get an absolute count.
    nodes.extend([array_job_node for (_, array_job_node) in array_job_index.values()])
    dynamic_job_spec = _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes), tasks=tasks, nodes=nodes, outputs=output_bindings, subworkflows=[]
    )

    return dynamic_job_spec, generated_files
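# A self-contained sketch of two pieces of bookkeeping from the function above (stand-in
# code, not flytekit): the per-document unique node-id counter, and the min_successes
# arithmetic that converts an allowed failure ratio into an absolute success count.
import math


def unique_node_ids(task_ids):
    generated = {}
    out = []
    for task_id in task_ids:
        # First occurrence gets suffix 0, later occurrences increment the per-task counter.
        count = generated[task_id] = generated.get(task_id, -1) + 1
        out.append("{}-{}".format(task_id, count).lower())
    return out


assert unique_node_ids(["square", "square", "cube"]) == ["square-0", "square-1", "cube-0"]

# With allowed_failure_ratio = 0.2 and an array job of size 10, at least
# ceil((1 - 0.2) * 10) = 8 instances must succeed.
assert int(math.ceil((1 - 0.2) * 10)) == 8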
def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.
    :param context:
    :param flytekit.models.literals.LiteralMap inputs:
    :rtype: (_dynamic_job.DynamicJobSpec, dict[Text, flytekit.models.common.FlyteIdlEntity])
    """
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs,
        {k: _type_helpers.get_sdk_type_from_literal_type(v.type) for k, v in _six.iteritems(self.interface.inputs)},
    )
    outputs_dict = {
        name: PromiseOutputReference(_type_helpers.get_sdk_type_from_literal_type(variable.type))
        for name, variable in _six.iteritems(self.interface.outputs)
    }

    # Because users declare both inputs and outputs in their function signatures, merge them together
    # before calling user code.
    inputs_dict.update(outputs_dict)
    yielded_sub_tasks = [
        sub_task for sub_task in super(SdkDynamicTask, self)._execute_user_code(context, inputs_dict) or []
    ]

    upstream_nodes = list()
    output_bindings = [
        _literal_models.Binding(
            var=name,
            binding=_interface.BindingData.from_python_std(
                b.sdk_type.to_flyte_literal_type(), b.raw_value, upstream_nodes=upstream_nodes
            ),
        )
        for name, b in _six.iteritems(outputs_dict)
    ]
    upstream_nodes = set(upstream_nodes)

    generated_files = {}
    # Keeping future-tasks in original order. We don't use upstream_nodes exclusively because the parent task can
    # yield sub-tasks that it never uses to produce final outputs but they need to execute nevertheless.
    array_job_index = {}
    tasks = set()
    nodes = []
    sub_workflows = set()
    visited_nodes = set()
    generated_ids = {}
    effective_failure_ratio = self._allowed_failure_ratio or 0.0

    # TODO: This function needs to be cleaned up.
    # The reason we chain these two together is because we allow users to not have to explicitly "yield" the
    # node. As long as the subtask/lp/subwf has an output that's referenced, it'll get picked up.
    for sub_task_node in _itertools.chain(yielded_sub_tasks, upstream_nodes):
        if sub_task_node in visited_nodes:
            continue
        visited_nodes.add(sub_task_node)
        executable = sub_task_node.executable_sdk_object

        # If the executable object that we're dealing with is registerable (i.e., SdkRunnableLaunchPlan,
        # SdkWorkflow, SdkTask, or SdkRunnableTask), then it should have the ability to give itself a name. After
        # assigning itself the name, also make sure the id is properly set according to current config values.
        if isinstance(executable, _registerable.RegisterableEntity):
            executable.auto_assign_name()
            executable._id = _identifier.Identifier(
                executable.resource_type,
                _internal_config.TASK_PROJECT.get() or _internal_config.PROJECT.get(),
                _internal_config.TASK_DOMAIN.get() or _internal_config.DOMAIN.get(),
                executable.platform_valid_name,
                _internal_config.TASK_VERSION.get() or _internal_config.VERSION.get(),
            )

        # Generate an id that's unique in the document (if the same task is used multiple times with
        # different resources, executable_sdk_object.id will be the same but generated node_ids should not be).
        safe_task_id = _six.text_type(sub_task_node.executable_sdk_object.id)
        if safe_task_id in generated_ids:
            new_count = generated_ids[safe_task_id] = generated_ids[safe_task_id] + 1
        else:
            new_count = generated_ids[safe_task_id] = 0
        unique_node_id = _dnsify("{}-{}".format(safe_task_id, new_count))

        # Handling the case where the yielded node is a launch plan
        if isinstance(sub_task_node.executable_sdk_object, _launch_plan.SdkLaunchPlan):
            node = sub_task_node.assign_id_and_return(unique_node_id)
            _append_node(generated_files, node, nodes, sub_task_node)
        # Handling the case where the yielded node is launching a sub-workflow
        elif isinstance(sub_task_node.executable_sdk_object, _workflow.SdkWorkflow):
            node = sub_task_node.assign_id_and_return(unique_node_id)
            _append_node(generated_files, node, nodes, sub_task_node)
            # Add the workflow itself to the yielded sub-workflows
            sub_workflows.add(sub_task_node.executable_sdk_object)
            # Recursively discover statically defined upstream entities (tasks, wfs)
            SdkDynamicTask._add_upstream_entities(sub_task_node.executable_sdk_object, sub_workflows, tasks)
        # Handling tasks
        else:
            # If the task can run as an array job, group its instances together. Otherwise, keep each
            # invocation as a separate node.
            if SdkDynamicTask._can_run_as_array(sub_task_node.executable_sdk_object.type):
                if sub_task_node.executable_sdk_object in array_job_index:
                    array_job, node = array_job_index[sub_task_node.executable_sdk_object]
                    array_job.size += 1
                    array_job.min_successes = int(math.ceil((1 - effective_failure_ratio) * array_job.size))
                else:
                    array_job = self._create_array_job(inputs_prefix=unique_node_id)
                    node = sub_task_node.assign_id_and_return(unique_node_id)
                    array_job_index[sub_task_node.executable_sdk_object] = (array_job, node)

                node_index = _six.text_type(array_job.size - 1)
                for k, node_output in _six.iteritems(sub_task_node.outputs):
                    if not node_output.sdk_node.id:
                        node_output.sdk_node.assign_id_and_return(node.id)
                    node_output.var = "[{}].{}".format(node_index, node_output.var)

                # Upload inputs to working directory under /array_job.input_ref/<index>/inputs.pb
                input_path = _os.path.join(node.id, node_index, _constants.INPUT_FILE_NAME)
                generated_files[input_path] = _literal_models.LiteralMap(
                    literals={binding.var: binding.binding.to_literal_model() for binding in sub_task_node.inputs}
                )
            else:
                node = sub_task_node.assign_id_and_return(unique_node_id)
                tasks.add(sub_task_node.executable_sdk_object)
                _append_node(generated_files, node, nodes, sub_task_node)

    # Assign the custom field to the ArrayJob properties computed.
    for task, (array_job, _) in _six.iteritems(array_job_index):
        # TODO: Reconstruct task template object instead of modifying an existing one?
        tasks.add(
            task.assign_custom_and_return(array_job.to_dict()).assign_type_and_return(
                _constants.SdkTaskType.CONTAINER_ARRAY_TASK
            )
        )

    # min_successes is absolute; it's computed as the inverse of allowed_failure_ratio, multiplied by the
    # total number of nodes to get an absolute count.
    nodes.extend([array_job_node for (_, array_job_node) in array_job_index.values()])
    dynamic_job_spec = _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes),
        tasks=list(tasks),
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=list(sub_workflows),
    )

    return dynamic_job_spec, generated_files
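# The recursive discovery that _add_upstream_entities performs above is, in spirit, a graph
# walk over statically known sub-entities. A hypothetical sketch under assumed shapes: the
# sub_workflows/tasks attributes on the workflow objects are illustrative assumptions, not
# the real SdkWorkflow API.
def collect_entities(workflow, sub_workflows: set, tasks: set) -> None:
    for wf in getattr(workflow, "sub_workflows", []):
        if wf not in sub_workflows:
            sub_workflows.add(wf)
            collect_entities(wf, sub_workflows, tasks)  # recurse into nested workflows
    for task in getattr(workflow, "tasks", []):
        tasks.add(task)


class _WF:  # tiny stand-in workflow container for the demo below
    def __init__(self, name, sub_workflows=(), tasks=()):
        self.name, self.sub_workflows, self.tasks = name, list(sub_workflows), list(tasks)


inner = _WF("inner", tasks=["task_b"])
outer = _WF("outer", sub_workflows=[inner], tasks=["task_a"])
subs, found_tasks = set(), set()
collect_entities(outer, subs, found_tasks)
assert found_tasks == {"task_a", "task_b"} and subs == {inner}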
def get_serializable_node(
    entity_mapping: OrderedDict,
    settings: SerializationSettings,
    entity: Node,
) -> workflow_model.Node:
    if entity.flyte_entity is None:
        raise Exception(f"Node {entity.id} has no flyte entity")

    upstream_sdk_nodes = [
        get_serializable(entity_mapping, settings, n)
        for n in entity.upstream_nodes
        if n.id != _common_constants.GLOBAL_INPUT_NODE_ID
    ]

    # Reference entities also inherit from the classes checked in the isinstance chain below, so address
    # them first.
    if isinstance(entity.flyte_entity, ReferenceEntity):
        # This is a throw-away call.
        # See the comment in compile_into_workflow in python_function_task. This is just used to place a None
        # value in the entity_mapping.
        get_serializable(entity_mapping, settings, entity.flyte_entity)
        ref = entity.flyte_entity
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
        )
        if ref.reference.resource_type == _identifier_model.ResourceType.TASK:
            node_model._task_node = workflow_model.TaskNode(reference_id=ref.id)
        elif ref.reference.resource_type == _identifier_model.ResourceType.WORKFLOW:
            node_model._workflow_node = workflow_model.WorkflowNode(sub_workflow_ref=ref.id)
        elif ref.reference.resource_type == _identifier_model.ResourceType.LAUNCH_PLAN:
            node_model._workflow_node = workflow_model.WorkflowNode(launchplan_ref=ref.id)
        else:
            raise Exception(f"Unexpected reference type {ref}")
        return node_model

    if isinstance(entity.flyte_entity, PythonTask):
        task_spec = get_serializable(entity_mapping, settings, entity.flyte_entity)
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            task_node=workflow_model.TaskNode(
                reference_id=task_spec.template.id, overrides=TaskNodeOverrides(resources=entity._resources)
            ),
        )
        if entity._aliases:
            node_model._output_aliases = entity._aliases
    elif isinstance(entity.flyte_entity, WorkflowBase):
        wf_spec = get_serializable(entity_mapping, settings, entity.flyte_entity)
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            workflow_node=workflow_model.WorkflowNode(sub_workflow_ref=wf_spec.template.id),
        )
    elif isinstance(entity.flyte_entity, BranchNode):
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            branch_node=get_serializable(entity_mapping, settings, entity.flyte_entity),
        )
    elif isinstance(entity.flyte_entity, LaunchPlan):
        lp_spec = get_serializable(entity_mapping, settings, entity.flyte_entity)
        node_model = workflow_model.Node(
            id=_dnsify(entity.id),
            metadata=entity.metadata,
            inputs=entity.bindings,
            upstream_node_ids=[n.id for n in upstream_sdk_nodes],
            output_aliases=[],
            workflow_node=workflow_model.WorkflowNode(launchplan_ref=lp_spec.id),
        )
    else:
        raise Exception(f"Node contained non-serializable entity {entity._flyte_entity}")

    return node_model
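# The reference-entity branch above maps a resource type onto the appropriate oneof field of
# the node model. A compact stand-in for that dispatch, written as a table instead of an elif
# chain (the enum names and field/keyword strings mirror the code above, but this is an
# illustrative sketch, not the flytekit API):
from enum import Enum


class ResourceType(Enum):
    TASK = 1
    WORKFLOW = 2
    LAUNCH_PLAN = 3


# resource type -> (node-model attribute to set, keyword used to build the inner model)
_REFERENCE_DISPATCH = {
    ResourceType.TASK: ("_task_node", "reference_id"),
    ResourceType.WORKFLOW: ("_workflow_node", "sub_workflow_ref"),
    ResourceType.LAUNCH_PLAN: ("_workflow_node", "launchplan_ref"),
}


def describe_reference(resource_type: ResourceType) -> str:
    attr, kwarg = _REFERENCE_DISPATCH[resource_type]
    return f"set node_model.{attr} via {kwarg}=ref.id"


assert describe_reference(ResourceType.LAUNCH_PLAN) == "set node_model._workflow_node via launchplan_ref=ref.id"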