def _generate_task(
        self, metadata_handler: metadata.Metadata,
        node: pipeline_pb2.PipelineNode) -> Optional[task_pb2.Task]:
    """Builds the next execution task for `node`, if one is warranted.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      A `Task` for the node, or `None` when the node is not feasible, when no
      input artifacts are resolved, or when the latest successful execution
      already consumed exactly the same input artifact ids.
    """
    if not task_gen_utils.is_feasible_node(node):
        return None

    node_executions = task_gen_utils.get_executions(metadata_handler, node)

    # If the helper yields a task from the existing executions, hand it back
    # unchanged instead of registering a new execution.
    active_task = task_gen_utils.generate_task_from_active_execution(
        self._pipeline, node, node_executions)
    if active_task:
        return active_task

    resolved_info = task_gen_utils.generate_resolved_info(
        metadata_handler, node)
    if resolved_info.input_artifacts is None:
        logging.info(
            'Task cannot be generated for node %s since no input artifacts '
            'are resolved.', node.node_info.id)
        return None

    # Skip triggering when the latest successful execution was fed the very
    # same input artifacts.
    # TODO(b/170231077): This logic should be handled by the resolver when
    # it's implemented. Also, currently only the artifact ids of the previous
    # execution are compared; differences in execution properties may need to
    # be factored in as well.
    last_success = task_gen_utils.get_latest_successful_execution(
        node_executions)
    if last_success:
        ids_by_event_type = (
            execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                metadata_handler, last_success.id))
        previous_input_ids = ids_by_event_type.get(
            metadata_store_pb2.Event.INPUT, set())
        resolved_input_ids = {
            artifact.id
            for artifact in itertools.chain.from_iterable(
                resolved_info.input_artifacts.values())
        }
        if previous_input_ids == resolved_input_ids:
            return None

    new_execution = execution_publish_utils.register_execution(
        metadata_handler=metadata_handler,
        execution_type=node.node_info.type,
        contexts=resolved_info.contexts,
        input_artifacts=resolved_info.input_artifacts,
        exec_properties=resolved_info.exec_properties)
    return task_gen_utils.create_task(self._pipeline, node, new_execution)
def testGetArtifactIdsForExecutionIdGroupedByEventType(self):
    """Checks artifact ids of an execution are grouped by event type.

    Stores one input and one output artifact, attaches both to a COMPLETE
    execution, then verifies the lookup maps `Event.INPUT` and `Event.OUTPUT`
    to the respective artifact id sets.
    """
    with metadata.Metadata(connection_config=self._connection_config) as m:
        # Register an input artifact and an output artifact in MLMD.
        examples = standard_artifacts.Examples()
        examples.uri = 'example'
        examples.type_id = common_utils.register_type_if_not_exist(
            m, examples.artifact_type).id

        model = standard_artifacts.Model()
        model.uri = 'model'
        model.type_id = common_utils.register_type_if_not_exist(
            m, model.artifact_type).id

        # The store returns the assigned ids in the order the artifacts were
        # submitted.
        examples.id, model.id = m.store.put_artifacts(
            [examples.mlmd_artifact, model.mlmd_artifact])

        execution_type = metadata_store_pb2.ExecutionType(
            name='my_execution_type')
        prepared = execution_lib.prepare_execution(
            m,
            execution_type,
            exec_properties={'p1': 1, 'p2': '2'},
            state=metadata_store_pb2.Execution.COMPLETE)
        contexts = self._generate_contexts(m)
        stored = execution_lib.put_execution(
            m,
            prepared,
            contexts,
            input_artifacts={'example': [examples]},
            output_artifacts={'model': [model]})

        expected = {
            metadata_store_pb2.Event.INPUT: {examples.id},
            metadata_store_pb2.Event.OUTPUT: {model.id},
        }
        actual = execution_lib.get_artifact_ids_by_event_type_for_execution_id(
            m, stored.id)
        self.assertDictEqual(expected, actual)
def _generate_task(
        self, metadata_handler: metadata.Metadata,
        node: pipeline_pb2.PipelineNode) -> Optional[task_lib.Task]:
    """Builds the next execution task for `node`, if one is warranted.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      `None` when no input artifacts are resolved or when the latest
      successful execution already consumed exactly the same input artifact
      ids. Otherwise a task: the one derived from an existing active
      execution, the result of `_abort_node_task` when the node's services
      report a status other than RUNNING, or a fresh `ExecNodeTask`.
    """
    node_executions = task_gen_utils.get_executions(metadata_handler, node)

    # If the helper yields a task from the existing executions, hand it back
    # unchanged instead of registering a new execution.
    active_task = task_gen_utils.generate_task_from_active_execution(
        metadata_handler, self._pipeline, node, node_executions)
    if active_task:
        return active_task

    resolved_info = task_gen_utils.generate_resolved_info(
        metadata_handler, node)
    resolved_inputs = resolved_info.input_artifacts
    # Both "nothing resolved" (None) and "every key maps to an empty list"
    # count as having no inputs.
    if resolved_inputs is None or not any(resolved_inputs.values()):
        logging.info(
            'Task cannot be generated for node %s since no input artifacts '
            'are resolved.', node.node_info.id)
        return None

    # Skip triggering when the latest successful execution was fed the very
    # same input artifacts.
    # TODO(b/170231077): This logic should be handled by the resolver when
    # it's implemented. Also, currently only the artifact ids of the previous
    # execution are compared; differences in execution properties may need to
    # be factored in as well.
    last_success = task_gen_utils.get_latest_successful_execution(
        node_executions)
    if last_success:
        ids_by_event_type = (
            execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                metadata_handler, last_success.id))
        previous_input_ids = ids_by_event_type.get(
            metadata_store_pb2.Event.INPUT, set())
        resolved_input_ids = {
            artifact.id
            for artifact in itertools.chain.from_iterable(
                resolved_inputs.values())
        }
        if previous_input_ids == resolved_input_ids:
            return None

    node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
    new_execution = execution_publish_utils.register_execution(
        metadata_handler=metadata_handler,
        execution_type=node.node_info.type,
        contexts=resolved_info.contexts,
        input_artifacts=resolved_inputs,
        exec_properties=resolved_info.exec_properties)
    resolver = outputs_utils.OutputsResolver(
        node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
        self._pipeline.execution_mode)

    # For mixed service nodes, we ensure node services and check service
    # status; the node is aborted if its service jobs have failed. A `None`
    # status skips the check entirely. Note the execution above has already
    # been registered by the time the abort path is taken.
    service_status = self._ensure_node_services_if_mixed(node.node_info.id)
    if (service_status is not None and
            service_status != service_jobs.ServiceStatus.RUNNING):
        return self._abort_node_task(node_uid)

    output_artifacts = resolver.generate_output_artifacts(new_execution.id)
    executor_output_uri = resolver.get_executor_output_uri(new_execution.id)
    stateful_working_dir = resolver.get_stateful_working_directory(
        new_execution.id)
    return task_lib.ExecNodeTask(
        node_uid=node_uid,
        execution=new_execution,
        contexts=resolved_info.contexts,
        input_artifacts=resolved_inputs,
        exec_properties=resolved_info.exec_properties,
        output_artifacts=output_artifacts,
        executor_output_uri=executor_output_uri,
        stateful_working_dir=stateful_working_dir,
        pipeline=self._pipeline)
def _generate_tasks_for_node(
        self, metadata_handler: metadata.Metadata,
        node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
    """Generates the list of tasks needed to advance `node`.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate tasks.

    Returns:
      A list of tasks, which is one of:
        * `[UpdateNodeStateTask(RUNNING), <task from active execution>]` when
          an existing execution yields a task;
        * `[]` when no input artifacts are resolved;
        * `[UpdateNodeStateTask(STARTED)]` when the latest execution has the
          same input artifact ids, execution properties and executor spec
          fingerprint as the current resolution;
        * `[UpdateNodeStateTask(FAILED)]` with an ABORTED status when the
          node's services report a status other than RUNNING;
        * `[UpdateNodeStateTask(RUNNING), ExecNodeTask]` for a newly
          registered execution.
    """
    node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
    node_executions = task_gen_utils.get_executions(metadata_handler, node)

    active_task = task_gen_utils.generate_task_from_active_execution(
        metadata_handler, self._pipeline, node, node_executions)
    if active_task:
        return [
            task_lib.UpdateNodeStateTask(
                node_uid=node_uid, state=pstate.NodeState.RUNNING),
            active_task,
        ]

    resolved_info = task_gen_utils.generate_resolved_info(
        metadata_handler, node)
    # TODO(b/207038460): Update async pipeline to support ForEach.
    # Only the first entry of the resolved inputs is consumed here.
    if (resolved_info is None or not resolved_info.input_artifacts or
            resolved_info.input_artifacts[0] is None or
            not any(resolved_info.input_artifacts[0].values())):
        logging.info(
            'Task cannot be generated for node %s since no input artifacts '
            'are resolved.', node.node_info.id)
        return []
    input_artifact = resolved_info.input_artifacts[0]

    # Fingerprint the executor spec (digest of empty input when there is no
    # spec) and record it among the execution properties.
    spec_hasher = hashlib.sha256()
    executor_spec = task_gen_utils.get_executor_spec(
        self._pipeline_state.pipeline, node.node_info.id)
    if executor_spec is not None:
        spec_hasher.update(executor_spec.SerializeToString(deterministic=True))
    resolved_info.exec_properties[
        constants.EXECUTOR_SPEC_FINGERPRINT_KEY] = spec_hasher.hexdigest()

    # If the latest execution had the same resolved input artifacts, execution
    # properties and executor specs, we should not trigger a new execution.
    previous = task_gen_utils.get_latest_execution(node_executions)
    if previous:
        ids_by_event_type = (
            execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                metadata_handler, previous.id))
        previous_input_ids = ids_by_event_type.get(
            metadata_store_pb2.Event.INPUT, set())
        current_input_ids = {
            artifact.id
            for artifact in itertools.chain.from_iterable(
                input_artifact.values())
        }
        previous_properties = task_gen_utils.extract_properties(previous)
        current_properties = resolved_info.exec_properties
        # NOTE(review): plain subscript lookups, done before any comparison;
        # presumably the previous execution always carries the fingerprint
        # key, otherwise this raises -- confirm.
        previous_fingerprint = previous_properties[
            constants.EXECUTOR_SPEC_FINGERPRINT_KEY]
        current_fingerprint = resolved_info.exec_properties[
            constants.EXECUTOR_SPEC_FINGERPRINT_KEY]
        nothing_changed = (
            previous_input_ids == current_input_ids and
            _exec_properties_match(previous_properties, current_properties)
            and previous_fingerprint == current_fingerprint)
        if nothing_changed:
            return [
                task_lib.UpdateNodeStateTask(
                    node_uid=node_uid, state=pstate.NodeState.STARTED)
            ]

    new_execution = execution_publish_utils.register_execution(
        metadata_handler=metadata_handler,
        execution_type=node.node_info.type,
        contexts=resolved_info.contexts,
        input_artifacts=input_artifact,
        exec_properties=resolved_info.exec_properties)
    resolver = outputs_utils.OutputsResolver(
        node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
        self._pipeline.execution_mode)

    # For mixed service nodes, we ensure node services and check service
    # status; the node is aborted if its service jobs have failed. A `None`
    # status skips the check entirely. Note the execution above has already
    # been registered by the time the failure path is taken.
    service_status = self._ensure_node_services_if_mixed(node.node_info.id)
    if (service_status is not None and
            service_status != service_jobs.ServiceStatus.RUNNING):
        error_msg = f'associated service job failed; node uid: {node_uid}'
        return [
            task_lib.UpdateNodeStateTask(
                node_uid=node_uid,
                state=pstate.NodeState.FAILED,
                status=status_lib.Status(
                    code=status_lib.Code.ABORTED, message=error_msg))
        ]

    # Output directories are created before the tasks are assembled.
    output_artifacts = resolver.generate_output_artifacts(new_execution.id)
    outputs_utils.make_output_dirs(output_artifacts)

    return [
        task_lib.UpdateNodeStateTask(
            node_uid=node_uid, state=pstate.NodeState.RUNNING),
        task_lib.ExecNodeTask(
            node_uid=node_uid,
            execution_id=new_execution.id,
            contexts=resolved_info.contexts,
            input_artifacts=input_artifact,
            exec_properties=resolved_info.exec_properties,
            output_artifacts=output_artifacts,
            executor_output_uri=resolver.get_executor_output_uri(
                new_execution.id),
            stateful_working_dir=resolver.get_stateful_working_directory(
                new_execution.id),
            tmp_dir=resolver.make_tmp_dir(new_execution.id),
            pipeline=self._pipeline),
    ]