def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.

    Every object returned by ``self._generate_plugin_objects`` becomes its own node,
    named ``HiveQuery_<index>``.

    :param context:
    :param flytekit.models.literals.LiteralMap literal_map inputs:
    :rtype: flytekit.models.dynamic_job.DynamicJobSpec
    """
    input_sdk_types = {
        k: _type_helpers.get_sdk_type_from_literal_type(v.type)
        for k, v in _six.iteritems(self.interface.inputs)
    }
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs, input_sdk_types)

    outputs_dict = {}
    for name, variable in _six.iteritems(self.interface.outputs):
        outputs_dict[name] = _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(variable.type))

    # Add outputs to inputs
    inputs_dict.update(outputs_dict)

    generated_queries = self._generate_plugin_objects(context, inputs_dict)

    # Create output bindings always - this has to happen after user code has run
    output_bindings = []
    for name, b in _six.iteritems(outputs_dict):
        binding_data = _interface.BindingData.from_python_std(
            b.sdk_type.to_flyte_literal_type(), b.value)
        output_bindings.append(
            _literal_models.Binding(var=name, binding=binding_data))

    # One node per query
    nodes = []
    tasks = []
    for index, qubole_hive_job in enumerate(generated_queries):
        hive_job_node = _create_hive_job_node(
            "HiveQuery_{}".format(index),
            qubole_hive_job.to_flyte_idl(),
            self.metadata,
        )
        nodes.append(hive_job_node)
        tasks.append(hive_job_node.executable_sdk_object)

    return _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes),
        tasks=tasks,
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=[],
    )
def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.

    :param context:
    :param flytekit.models.literals.LiteralMap literal_map inputs:
    :rtype: flytekit.models.dynamic_job.DynamicJobSpec
    """
    input_sdk_types = {
        k: _type_helpers.get_sdk_type_from_literal_type(v.type)
        for k, v in _six.iteritems(self.interface.inputs)
    }
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs, input_sdk_types)

    outputs_dict = {}
    for name, variable in _six.iteritems(self.interface.outputs):
        outputs_dict[name] = _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(variable.type))

    # Add outputs to inputs
    inputs_dict.update(outputs_dict)

    generated_queries = self._generate_hive_queries(context, inputs_dict)

    # Create output bindings always - this has to happen after user code has run
    output_bindings = []
    for name, b in _six.iteritems(outputs_dict):
        binding_data = _interface.BindingData.from_python_std(
            b.sdk_type.to_flyte_literal_type(), b.value)
        output_bindings.append(
            _literal_models.Binding(var=name, binding=binding_data))

    # Note: Today a hive task corresponds to a dynamic job spec with one node, which contains multiple
    # queries. We may change this in future.
    nodes = []
    tasks = []
    if len(generated_queries.query_collection.queries) > 0:
        hive_job_node = _create_hive_job_node(
            "HiveQueries", generated_queries.to_flyte_idl(), self.metadata)
        nodes.append(hive_job_node)
        tasks.append(hive_job_node.executable_sdk_object)

    # At most we only have one node for now (see the note above), so min_successes is 0 or 1.
    return _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes),
        tasks=tasks,
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=[],
    )
def compile_into_workflow(
    self, ctx: FlyteContext, task_function: Callable, **kwargs
) -> Union[_dynamic_job.DynamicJobSpec, _literal_models.LiteralMap]:
    """
    Compile ``task_function`` as a workflow and turn the result into a dynamic job spec.

    The compiled workflow is stored on ``self._wf``. When compilation yields no nodes,
    the workflow's output bindings are returned directly as a ``LiteralMap``; otherwise a
    ``DynamicJobSpec`` that requires every node to succeed is returned.
    """
    with ctx.new_compilation_context(prefix="dynamic"):
        # TODO: Resolve circular import
        from flytekit.common.translator import get_serializable

        self._wf = Workflow(
            task_function,
            metadata=WorkflowMetadata(
                on_failure=WorkflowFailurePolicy.FAIL_IMMEDIATELY),
            default_metadata=WorkflowMetadataDefaults(interruptible=False),
        )
        self._wf.compile(**kwargs)

        sdk_workflow = get_serializable(ctx.serialization_settings, self._wf)

        # If no nodes were produced, let's just return the strict outputs
        if len(sdk_workflow.nodes) == 0:
            literals = {}
            for binding in sdk_workflow._outputs:
                literals[binding.var] = binding.binding.to_literal_model()
            return _literal_models.LiteralMap(literals=literals)

        # Gather underlying tasks/workflows that get referenced. Launch plans are handled by propeller.
        tasks = set()
        sub_workflows = set()
        for node in sdk_workflow.nodes:
            self.aggregate(tasks, sub_workflows, node)

        return _dynamic_job.DynamicJobSpec(
            min_successes=len(sdk_workflow.nodes),
            tasks=list(tasks),
            nodes=sdk_workflow.nodes,
            outputs=sdk_workflow._outputs,
            subworkflows=list(sub_workflows),
        )
def test_future_task_document(task):
    """Check that a DynamicJobSpec survives a round trip through its flyte IDL form."""
    retry_strategy = _literals.RetryStrategy(0)
    node_metadata = _workflow.NodeMetadata(
        'node-name', _timedelta(minutes=10), retry_strategy)
    node = _workflow.Node(
        id="id",
        metadata=node_metadata,
        inputs=[],
        upstream_node_ids=[],
        output_aliases=[],
        task_node=_workflow.TaskNode(task.id),
    )
    # Result is discarded; the call only has to succeed.
    node.to_flyte_idl()

    doc = _dynamic_job.DynamicJobSpec(
        tasks=[task],
        nodes=[node],
        min_successes=1,
        outputs=[_literals.Binding("var", _literals.BindingData())],
        subworkflows=[],
    )

    expected_text = text_format.MessageToString(doc.to_flyte_idl())
    round_tripped = _dynamic_job.DynamicJobSpec.from_flyte_idl(
        doc.to_flyte_idl())
    actual_text = text_format.MessageToString(round_tripped.to_flyte_idl())
    assert expected_text == actual_text
def compile_into_workflow(
    self, ctx: FlyteContext, task_function: Callable, **kwargs
) -> Union[_dynamic_job.DynamicJobSpec, _literal_models.LiteralMap]:
    """
    Compile ``task_function`` as a workflow and turn the result into a dynamic job spec.

    The compiled workflow is stored on ``self._wf``. When compilation yields no nodes,
    the workflow's output bindings are returned directly as a ``LiteralMap``.

    When ``ctx.serialization_settings.should_fast_serialize()`` is true, the
    ``{{ .remote_package_path }}`` and ``{{ .dest_dir }}`` placeholders in every task
    template's container args are replaced, in place, with the ``dynamic_addl_distro`` and
    ``dynamic_dest_dir`` (default ``"."``) values from
    ``ctx.execution_state.additional_context``.

    :raises Exception: if any serialized entity is ``None`` (treated as a reference task).
    :raises AssertionError: if fast serialization is requested but no
        ``dynamic_addl_distro`` is available in the execution state.
    """
    if not ctx.compilation_state:
        cs = ctx.new_compilation_state("dynamic")
    else:
        cs = ctx.compilation_state.with_params(prefix="dynamic")

    with FlyteContextManager.with_context(ctx.with_compilation_state(cs)):
        # TODO: Resolve circular import
        from flytekit.common.translator import get_serializable

        workflow_metadata = WorkflowMetadata(
            on_failure=WorkflowFailurePolicy.FAIL_IMMEDIATELY)
        defaults = WorkflowMetadataDefaults(
            interruptible=self.metadata.interruptible
            if self.metadata.interruptible is not None else False)

        self._wf = PythonFunctionWorkflow(task_function,
                                          metadata=workflow_metadata,
                                          default_metadata=defaults)
        self._wf.compile(**kwargs)

        wf = self._wf
        model_entities = OrderedDict()
        # See comment on reference entity checking a bit down below in this function.
        # This is the only circular dependency between the translator.py module and the rest of the flytekit
        # authoring experience.
        workflow_spec: admin_workflow_models.WorkflowSpec = get_serializable(
            model_entities, ctx.serialization_settings, wf)

        # If no nodes were produced, let's just return the strict outputs
        if len(workflow_spec.template.nodes) == 0:
            return _literal_models.LiteralMap(
                literals={
                    binding.var: binding.binding.to_literal_model()
                    for binding in workflow_spec.template.outputs
                })

        # This is not great. The translator.py module is relied on here (see comment above) to get the tasks and
        # subworkflow definitions. However we want to ensure that reference tasks and reference sub workflows are
        # not used.
        # TODO: Replace None with a class.
        for value in model_entities.values():
            if value is None:
                raise Exception(
                    "Reference tasks are not allowed in the dynamic - a network call is necessary "
                    "in order to retrieve the structure of the reference task."
                )

        # Gather underlying TaskTemplates that get referenced. Launch plans are handled by propeller. Subworkflows
        # should already be in the workflow spec.
        tts = [
            v.template for v in model_entities.values()
            if isinstance(v, task_models.TaskSpec)
        ]

        if ctx.serialization_settings.should_fast_serialize():
            if (not ctx.execution_state
                    or not ctx.execution_state.additional_context
                    or not ctx.execution_state.additional_context.get(
                        "dynamic_addl_distro")):
                raise AssertionError(
                    "Compilation for a dynamic workflow called in fast execution mode but no additional code "
                    "distribution could be retrieved")
            additional_context = ctx.execution_state.additional_context
            # Logger.warn is a deprecated alias; warning() is the supported spelling.
            logger.warning(
                "ctx.execution_state.additional_context %s",
                additional_context)

            # The replacement values do not depend on the task, so look them up once.
            placeholder_values = {
                "{{ .remote_package_path }}":
                additional_context.get("dynamic_addl_distro"),
                "{{ .dest_dir }}":
                additional_context.get("dynamic_dest_dir", "."),
            }
            for task_template in tts:
                sanitized_args = [
                    placeholder_values.get(arg, arg)
                    for arg in task_template.container.args
                ]
                # Mutate the existing args container rather than rebinding it.
                del task_template.container.args[:]
                task_template.container.args.extend(sanitized_args)

        dj_spec = _dynamic_job.DynamicJobSpec(
            min_successes=len(workflow_spec.template.nodes),
            tasks=tts,
            nodes=workflow_spec.template.nodes,
            outputs=workflow_spec.template.outputs,
            subworkflows=workflow_spec.sub_workflows,
        )

        return dj_spec
def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.

    Sub-tasks whose type passes ``SdkDynamicTask._can_run_as_array`` are grouped into one
    array-job node per task; every other sub-task becomes its own node. The serialized
    inputs of each sub-task are collected in the returned file mapping, keyed by the
    relative path they should be written to.

    :param context:
    :param flytekit.models.literals.LiteralMap literal_map inputs:
    :rtype: (_dynamic_job.DynamicJobSpec, dict[Text, flytekit.models.common.FlyteIdlEntity])
    """
    input_sdk_types = {
        k: _type_helpers.get_sdk_type_from_literal_type(v.type)
        for k, v in _six.iteritems(self.interface.inputs)
    }
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs, input_sdk_types)
    outputs_dict = {
        name: PromiseOutputReference(
            _type_helpers.get_sdk_type_from_literal_type(variable.type))
        for name, variable in _six.iteritems(self.interface.outputs)
    }

    inputs_dict.update(outputs_dict)
    yielded_sub_tasks = list(
        super(SdkDynamicTask, self)._execute_user_code(context, inputs_dict)
        or [])

    # NOTE(review): presumably from_python_std appends the nodes referenced by each output
    # to this list - confirm against BindingData.from_python_std.
    referenced_nodes = list()
    output_bindings = []
    for name, b in _six.iteritems(outputs_dict):
        binding_data = _interface.BindingData.from_python_std(
            b.sdk_type.to_flyte_literal_type(),
            b.raw_value,
            upstream_nodes=referenced_nodes)
        output_bindings.append(
            _literal_models.Binding(var=name, binding=binding_data))
    upstream_nodes = set(referenced_nodes)

    generated_files = {}
    array_job_index = {}
    tasks = []
    nodes = []
    visited_nodes = set()
    generated_ids = {}
    effective_failure_ratio = self._allowed_failure_ratio or 0.0

    # Keeping future-tasks in original order. We don't use upstream_nodes exclusively because the parent task can
    # yield sub-tasks that it never uses to produce final outputs but they need to execute nevertheless.
    for sub_task_node in _itertools.chain(yielded_sub_tasks, upstream_nodes):
        if sub_task_node in visited_nodes:
            continue
        visited_nodes.add(sub_task_node)

        executable = sub_task_node.executable_sdk_object

        # Generate an id that's unique in the document (if the same task is used multiple times with
        # different resources, executable_sdk_object.id will be the same but generated node_ids should not
        # be.
        safe_task_id = _six.text_type(executable.id)
        new_count = generated_ids.get(safe_task_id, -1) + 1
        generated_ids[safe_task_id] = new_count
        unique_node_id = _dnsify("{}-{}".format(safe_task_id, new_count))

        # If the task can run as an array job, group its instances together. Otherwise, keep each invocation as a
        # separate node.
        if SdkDynamicTask._can_run_as_array(executable.type):
            if executable in array_job_index:
                array_job, node = array_job_index[executable]
                array_job.size += 1
                array_job.min_successes = int(
                    math.ceil((1 - effective_failure_ratio) * array_job.size))
            else:
                array_job = self._create_array_job(
                    inputs_prefix=unique_node_id)
                node = sub_task_node.assign_id_and_return(unique_node_id)
                array_job_index[executable] = (array_job, node)

            node_index = _six.text_type(array_job.size - 1)
            for _name, node_output in _six.iteritems(sub_task_node.outputs):
                if not node_output.sdk_node.id:
                    node_output.sdk_node.assign_id_and_return(node.id)
                node_output.var = "[{}].{}".format(node_index,
                                                   node_output.var)

            # Upload inputs to working directory under /array_job.input_ref/<index>/inputs.pb
            input_path = _os.path.join(node.id, node_index,
                                       _constants.INPUT_FILE_NAME)
        else:
            node = sub_task_node.assign_id_and_return(unique_node_id)
            tasks.append(executable)
            nodes.append(node)

            for _name, node_output in _six.iteritems(sub_task_node.outputs):
                if not node_output.sdk_node.id:
                    node_output.sdk_node.assign_id_and_return(node.id)

            # Upload inputs to working directory under /array_job.input_ref/inputs.pb
            input_path = _os.path.join(node.id, _constants.INPUT_FILE_NAME)

        input_literals = {}
        for binding in sub_task_node.inputs:
            input_literals[binding.var] = binding.binding.to_literal_model()
        generated_files[input_path] = _literal_models.LiteralMap(
            literals=input_literals)

    # assign custom field to the ArrayJob properties computed.
    for task, (array_job, _) in _six.iteritems(array_job_index):
        # TODO: Reconstruct task template object instead of modifying an existing one?
        array_task = task.assign_custom_and_return(array_job.to_dict())
        tasks.append(
            array_task.assign_type_and_return(
                _constants.SdkTaskType.CONTAINER_ARRAY_TASK))

    # min_successes is absolute, it's computed as the reverse of allowed_failure_ratio and multiplied by the
    # total length of tasks to get an absolute count.
    nodes.extend(
        array_job_node for _, array_job_node in array_job_index.values())

    dynamic_job_spec = _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes),
        tasks=tasks,
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=[],
    )
    return dynamic_job_spec, generated_files
def compile_into_workflow(
    self, ctx: FlyteContext, is_fast_execution: bool, task_function: Callable, **kwargs
) -> Union[_dynamic_job.DynamicJobSpec, _literal_models.LiteralMap]:
    """
    Compile ``task_function`` as a workflow and turn the result into a dynamic job spec.

    The compiled workflow is stored on ``self._wf``. When compilation yields no nodes,
    the workflow's output bindings are returned directly as a ``LiteralMap``.

    When ``is_fast_execution`` is true, the ``{{ .remote_package_path }}`` and
    ``{{ .dest_dir }}`` placeholders in every gathered task's container args are replaced,
    in place, with the ``dynamic_addl_distro`` and ``dynamic_dest_dir`` (default ``"."``)
    values from ``ctx.execution_state.additional_context``.

    :raises AssertionError: if ``is_fast_execution`` is true but no ``dynamic_addl_distro``
        is available in the execution state.
    """
    if not ctx.compilation_state:
        cs = ctx.new_compilation_state("dynamic")
    else:
        cs = ctx.compilation_state.with_params(prefix="dynamic")

    with FlyteContextManager.with_context(ctx.with_compilation_state(cs)):
        # TODO: Resolve circular import
        from flytekit.common.translator import get_serializable

        workflow_metadata = WorkflowMetadata(
            on_failure=WorkflowFailurePolicy.FAIL_IMMEDIATELY)
        defaults = WorkflowMetadataDefaults(interruptible=False)

        self._wf = PythonFunctionWorkflow(task_function,
                                          metadata=workflow_metadata,
                                          default_metadata=defaults)
        self._wf.compile(**kwargs)

        wf = self._wf
        sdk_workflow = get_serializable(OrderedDict(),
                                        ctx.serialization_settings, wf,
                                        is_fast_execution)

        # If no nodes were produced, let's just return the strict outputs
        if len(sdk_workflow.nodes) == 0:
            return _literal_models.LiteralMap(
                literals={
                    binding.var: binding.binding.to_literal_model()
                    for binding in sdk_workflow._outputs
                })

        # Gather underlying tasks/workflows that get referenced. Launch plans are handled by propeller.
        tasks = set()
        sub_workflows = set()
        for n in sdk_workflow.nodes:
            self.aggregate(tasks, sub_workflows, n)

        if is_fast_execution:
            if (not ctx.execution_state
                    or not ctx.execution_state.additional_context
                    or not ctx.execution_state.additional_context.get(
                        "dynamic_addl_distro")):
                raise AssertionError(
                    "Compilation for a dynamic workflow called in fast execution mode but no additional code "
                    "distribution could be retrieved")
            additional_context = ctx.execution_state.additional_context
            # Logger.warn is a deprecated alias; warning() is the supported spelling.
            logger.warning(
                "ctx.execution_state.additional_context %s",
                additional_context)

            # The replacement values do not depend on the task, so look them up once.
            placeholder_values = {
                "{{ .remote_package_path }}":
                additional_context.get("dynamic_addl_distro"),
                "{{ .dest_dir }}":
                additional_context.get("dynamic_dest_dir", "."),
            }

            # NOTE(review): the set is rebuilt after the args are mutated, as the original
            # code did - presumably because a task's hash may depend on its args; confirm.
            sanitized_tasks = set()
            for task in tasks:
                sanitized_args = [
                    placeholder_values.get(arg, arg)
                    for arg in task.container.args
                ]
                # Mutate the existing args container rather than rebinding it.
                del task.container.args[:]
                task.container.args.extend(sanitized_args)
                sanitized_tasks.add(task)

            tasks = sanitized_tasks

        dj_spec = _dynamic_job.DynamicJobSpec(
            min_successes=len(sdk_workflow.nodes),
            tasks=list(tasks),
            nodes=sdk_workflow.nodes,
            outputs=sdk_workflow._outputs,
            subworkflows=list(sub_workflows),
        )

        return dj_spec
def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.

    Yielded or referenced nodes are handled by the kind of their executable: launch plans
    and sub-workflows each get their own node, tasks whose type passes
    ``SdkDynamicTask._can_run_as_array`` are grouped into one array-job node per task, and
    all remaining tasks get one node per invocation.

    :param context:
    :param flytekit.models.literals.LiteralMap literal_map inputs:
    :rtype: (_dynamic_job.DynamicJobSpec, dict[Text, flytekit.models.common.FlyteIdlEntity])
    """
    input_sdk_types = {
        k: _type_helpers.get_sdk_type_from_literal_type(v.type)
        for k, v in _six.iteritems(self.interface.inputs)
    }
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs, input_sdk_types)
    outputs_dict = {
        name: PromiseOutputReference(
            _type_helpers.get_sdk_type_from_literal_type(variable.type))
        for name, variable in _six.iteritems(self.interface.outputs)
    }

    # Because users declare both inputs and outputs in their functions signatures, merge them together
    # before calling user code
    inputs_dict.update(outputs_dict)
    yielded_sub_tasks = list(
        super(SdkDynamicTask, self)._execute_user_code(context, inputs_dict)
        or [])

    # NOTE(review): presumably from_python_std appends the nodes referenced by each output
    # to this list - confirm against BindingData.from_python_std.
    referenced_nodes = list()
    output_bindings = []
    for name, b in _six.iteritems(outputs_dict):
        binding_data = _interface.BindingData.from_python_std(
            b.sdk_type.to_flyte_literal_type(),
            b.raw_value,
            upstream_nodes=referenced_nodes)
        output_bindings.append(
            _literal_models.Binding(var=name, binding=binding_data))
    upstream_nodes = set(referenced_nodes)

    generated_files = {}
    array_job_index = {}
    tasks = set()
    nodes = []
    sub_workflows = set()
    visited_nodes = set()
    generated_ids = {}
    effective_failure_ratio = self._allowed_failure_ratio or 0.0

    # TODO: This function needs to be cleaned up.
    # Keeping future-tasks in original order. We don't use upstream_nodes exclusively because the parent task can
    # yield sub-tasks that it never uses to produce final outputs but they need to execute nevertheless.
    # The reason we chain these two together is because we allow users to not have to explicitly "yield" the
    # node. As long as the subtask/lp/subwf has an output that's referenced, it'll get picked up.
    for sub_task_node in _itertools.chain(yielded_sub_tasks, upstream_nodes):
        if sub_task_node in visited_nodes:
            continue
        visited_nodes.add(sub_task_node)
        executable = sub_task_node.executable_sdk_object

        # If the executable object that we're dealing with is registerable (ie, SdkRunnableLaunchPlan, SdkWorkflow
        # SdkTask, or SdkRunnableTask), then it should have the ability to give itself a name. After assigning
        # itself the name, also make sure the id is properly set according to current config values.
        if isinstance(executable, _registerable.RegisterableEntity):
            executable.auto_assign_name()
            executable._id = _identifier.Identifier(
                executable.resource_type,
                _internal_config.TASK_PROJECT.get()
                or _internal_config.PROJECT.get(),
                _internal_config.TASK_DOMAIN.get()
                or _internal_config.DOMAIN.get(),
                executable.platform_valid_name,
                _internal_config.TASK_VERSION.get()
                or _internal_config.VERSION.get(),
            )

        # Generate an id that's unique in the document (if the same task is used multiple times with
        # different resources, executable_sdk_object.id will be the same but generated node_ids should not
        # be.
        safe_task_id = _six.text_type(executable.id)
        new_count = generated_ids.get(safe_task_id, -1) + 1
        generated_ids[safe_task_id] = new_count
        unique_node_id = _dnsify("{}-{}".format(safe_task_id, new_count))

        if isinstance(executable, _launch_plan.SdkLaunchPlan):
            # Handling case where the yielded node is launch plan
            node = sub_task_node.assign_id_and_return(unique_node_id)
            _append_node(generated_files, node, nodes, sub_task_node)
        elif isinstance(executable, _workflow.SdkWorkflow):
            # Handling case where the yielded node is launching a sub-workflow
            node = sub_task_node.assign_id_and_return(unique_node_id)
            _append_node(generated_files, node, nodes, sub_task_node)
            # Add the workflow itself to the yielded sub-workflows
            sub_workflows.add(executable)
            # Recursively discover statically defined upstream entities (tasks, wfs)
            SdkDynamicTask._add_upstream_entities(executable, sub_workflows,
                                                  tasks)
        elif SdkDynamicTask._can_run_as_array(executable.type):
            # Handling tasks that can run as an array job: group the instances of one task together.
            if executable in array_job_index:
                array_job, node = array_job_index[executable]
                array_job.size += 1
                array_job.min_successes = int(
                    math.ceil((1 - effective_failure_ratio) * array_job.size))
            else:
                array_job = self._create_array_job(
                    inputs_prefix=unique_node_id)
                node = sub_task_node.assign_id_and_return(unique_node_id)
                array_job_index[executable] = (array_job, node)

            node_index = _six.text_type(array_job.size - 1)
            for _name, node_output in _six.iteritems(sub_task_node.outputs):
                if not node_output.sdk_node.id:
                    node_output.sdk_node.assign_id_and_return(node.id)
                node_output.var = "[{}].{}".format(node_index,
                                                   node_output.var)

            # Upload inputs to working directory under /array_job.input_ref/<index>/inputs.pb
            input_path = _os.path.join(node.id, node_index,
                                       _constants.INPUT_FILE_NAME)
            input_literals = {}
            for binding in sub_task_node.inputs:
                input_literals[binding.var] = binding.binding.to_literal_model()
            generated_files[input_path] = _literal_models.LiteralMap(
                literals=input_literals)
        else:
            # Handling all other tasks: keep each invocation as a separate node.
            node = sub_task_node.assign_id_and_return(unique_node_id)
            tasks.add(executable)
            _append_node(generated_files, node, nodes, sub_task_node)

    # assign custom field to the ArrayJob properties computed.
    for task, (array_job, _) in _six.iteritems(array_job_index):
        # TODO: Reconstruct task template object instead of modifying an existing one?
        array_task = task.assign_custom_and_return(array_job.to_dict())
        tasks.add(
            array_task.assign_type_and_return(
                _constants.SdkTaskType.CONTAINER_ARRAY_TASK))

    # min_successes is absolute, it's computed as the reverse of allowed_failure_ratio and multiplied by the
    # total length of tasks to get an absolute count.
    nodes.extend(
        array_job_node for _, array_job_node in array_job_index.values())

    dynamic_job_spec = _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes),
        tasks=list(tasks),
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=list(sub_workflows),
    )
    return dynamic_job_spec, generated_files
def compile_into_workflow(
    self, ctx: FlyteContext, task_function: Callable, **kwargs
) -> Union[_dynamic_job.DynamicJobSpec, _literal_models.LiteralMap]:
    """
    In the case of dynamic workflows, this function will produce a workflow definition at execution time which will
    then proceed to be executed.

    The compiled workflow is stored on ``self._wf``. If it has no nodes, its output
    bindings are returned as a ``LiteralMap`` instead of a ``DynamicJobSpec``.

    :raises Exception: if a ``ReferenceTask`` is among the serialized entities.
    :raises TypeError: if a task's serialized form is not a ``TaskSpec``.
    """
    # TODO: circular import
    from flytekit.core.task import ReferenceTask

    if ctx.compilation_state:
        cs = ctx.compilation_state.with_params(prefix="d")
    else:
        cs = ctx.new_compilation_state(prefix="d")

    with FlyteContextManager.with_context(ctx.with_compilation_state(cs)):
        # TODO: Resolve circular import
        from flytekit.tools.translator import get_serializable

        if self.metadata.interruptible is not None:
            interruptible = self.metadata.interruptible
        else:
            interruptible = False

        self._wf = PythonFunctionWorkflow(
            task_function,
            metadata=WorkflowMetadata(
                on_failure=WorkflowFailurePolicy.FAIL_IMMEDIATELY),
            default_metadata=WorkflowMetadataDefaults(
                interruptible=interruptible),
        )
        self._wf.compile(**kwargs)

        model_entities = OrderedDict()
        # See comment on reference entity checking a bit down below in this function.
        # This is the only circular dependency between the translator.py module and the rest of the flytekit
        # authoring experience.
        workflow_spec: admin_workflow_models.WorkflowSpec = get_serializable(
            model_entities, ctx.serialization_settings, self._wf)

        # If no nodes were produced, let's just return the strict outputs
        if len(workflow_spec.template.nodes) == 0:
            literals = {}
            for binding in workflow_spec.template.outputs:
                literals[binding.var] = binding.binding.to_literal_model()
            return _literal_models.LiteralMap(literals=literals)

        # Gather underlying TaskTemplates that get referenced.
        tts = []
        for entity, model in model_entities.items():
            # Handle FlyteTask
            if isinstance(entity, task_models.TaskTemplate):
                tts.append(entity)
                continue

            # We only care about gathering tasks here. Launch plans are handled by
            # propeller. Subworkflows should already be in the workflow spec.
            if not isinstance(entity, Task):
                continue

            # We are currently not supporting reference tasks since these will
            # require a network call to flyteadmin to populate the TaskTemplate
            # model
            if isinstance(entity, ReferenceTask):
                raise Exception(
                    "Reference tasks are currently unsupported within dynamic tasks"
                )

            if not isinstance(model, task_models.TaskSpec):
                raise TypeError(
                    f"Unexpected type for serialized form of task. Expected {task_models.TaskSpec}, but got {type(model)}"
                )

            # Store the valid task template so that we can pass it to the
            # DynamicJobSpec later
            tts.append(model.template)

        return _dynamic_job.DynamicJobSpec(
            min_successes=len(workflow_spec.template.nodes),
            tasks=tts,
            nodes=workflow_spec.template.nodes,
            outputs=workflow_spec.template.outputs,
            subworkflows=workflow_spec.sub_workflows,
        )