Beispiel #1
0
    def test_executor_failure(self):
        """Executor failure under an OK scheduler status fails the execution."""
        # The fake task scheduler reports an OK status, but the executor output
        # it carries has code FAILED_PRECONDITION and message 'foobar error'.
        scheduler_status = status_lib.Status(code=status_lib.Code.OK)
        failed_executor_output = _make_executor_output(
            self._task,
            code=status_lib.Code.FAILED_PRECONDITION,
            msg='foobar error')
        self._register_task_scheduler(
            ts.TaskSchedulerResult(
                status=scheduler_status,
                output=ts.ExecutorNodeOutput(
                    executor_output=failed_executor_output)))

        # The task manager itself must finish without raising.
        manager = self._run_task_manager()
        self.assertTrue(manager.done())
        self.assertIsNone(manager.exception())

        # The task must have been consumed and the MLMD execution marked
        # failed, with the executor's message recorded as the error message.
        self.assertTrue(self._task_queue.is_empty())
        execution = self._get_execution()
        self.assertEqual(metadata_store_pb2.Execution.FAILED,
                         execution.last_known_state)
        recorded_error = execution.custom_properties[
            constants.EXECUTION_ERROR_MSG_KEY]
        self.assertEqual('foobar error',
                         data_types_utils.get_metadata_value(recorded_error))

        # Stateful working dir, tmp_dir and output artifact URI must all be
        # gone.
        for removed_path in (self._task.stateful_working_dir,
                             self._task.tmp_dir,
                             self._output_artifact_uri):
            self.assertFalse(os.path.exists(removed_path))
Beispiel #2
0
def _get_pipeline_from_orchestrator_execution(
        execution: metadata_store_pb2.Execution) -> pipeline_pb2.Pipeline:
    """Rebuilds the pipeline proto stored on an orchestrator execution.

    The `_PIPELINE_IR` execution property is read, base64-decoded and parsed
    as a serialized `pipeline_pb2.Pipeline`.

    Args:
      execution: Execution whose `properties` hold the `_PIPELINE_IR` entry.

    Returns:
      The parsed `pipeline_pb2.Pipeline`.
    """
    encoded_ir = data_types_utils.get_metadata_value(
        execution.properties[_PIPELINE_IR])
    serialized_ir = base64.b64decode(encoded_ir)
    parsed = pipeline_pb2.Pipeline()
    parsed.ParseFromString(serialized_ir)
    return parsed
Beispiel #3
0
def _attach_artifact_properties(spec: pipeline_pb2.OutputSpec.ArtifactSpec,
                                artifact: types.Artifact):
    """Copies the properties declared in an ArtifactSpec onto an artifact.

    Entries of `spec.additional_properties` are set as attributes on
    `artifact`; entries of `spec.additional_custom_properties` are set through
    the artifact's typed custom-property setters.

    Args:
      spec: The artifact spec providing the property values.
      artifact: The artifact to update in place.

    Raises:
      RuntimeError: If an entry has no `field_value`, or a custom property
        holds a value kind other than int, string or double.
    """
    # Maps the populated member of the `value` oneof to the name of the
    # artifact setter handling it. The setter is looked up lazily, per entry.
    setter_name_by_kind = {
        'int_value': 'set_int_custom_property',
        'string_value': 'set_string_custom_property',
        'double_value': 'set_float_custom_property',
    }

    for key, entry in spec.additional_properties.items():
        if not entry.HasField('field_value'):
            raise RuntimeError('Property value is not a field_value for %s' %
                               key)
        plain_value = data_types_utils.get_metadata_value(entry.field_value)
        setattr(artifact, key, plain_value)

    for key, entry in spec.additional_custom_properties.items():
        if not entry.HasField('field_value'):
            raise RuntimeError('Property value is not a field_value for %s' %
                               key)
        value_type = entry.field_value.WhichOneof('value')
        setter_name = setter_name_by_kind.get(value_type)
        if setter_name is None:
            raise RuntimeError(f'Unexpected value_type: {value_type}')
        set_custom_property = getattr(artifact, setter_name)
        set_custom_property(key, getattr(entry.field_value, value_type))
Beispiel #4
0
 def pipeline(self) -> pipeline_pb2.Pipeline:
     """Returns the pipeline proto, decoding it from the execution if needed.

     When `self._pipeline` is falsy, the `_PIPELINE_IR` execution property is
     base64-decoded, parsed into a `pipeline_pb2.Pipeline` and stored on
     `self._pipeline` before being returned.
     """
     if self._pipeline:
         return self._pipeline
     encoded_ir = data_types_utils.get_metadata_value(
         self.execution.properties[_PIPELINE_IR])
     decoded = pipeline_pb2.Pipeline()
     decoded.ParseFromString(base64.b64decode(encoded_ir))
     self._pipeline = decoded
     return self._pipeline
Beispiel #5
0
def _extract_properties(
    execution: metadata_store_pb2.Execution) -> Dict[Text, types.Property]:
  """Collects an execution's properties and custom properties into one dict.

  Properties are visited first, then custom properties, so a custom property
  overrides a property stored under the same key.

  Args:
    execution: The execution to read from.

  Returns:
    A dict from property key to its unwrapped value.

  Raises:
    ValueError: If any property unwraps to `None`.
  """
  extracted = {}
  for property_map in (execution.properties, execution.custom_properties):
    for key, mlmd_value in property_map.items():
      unwrapped = data_types_utils.get_metadata_value(mlmd_value)
      if unwrapped is None:
        raise ValueError(f'Unexpected property with empty value; key: {key}')
      extracted[key] = unwrapped
  return extracted
Beispiel #6
0
 def from_mlmd_value(
         cls,
         value: Optional[metadata_store_pb2.Value] = None
 ) -> 'ManualNodeState':
     """Builds a `ManualNodeState` from an MLMD value.

     Args:
       value: MLMD value expected to hold the JSON form of the state; may be
         omitted.

     Returns:
       The result of `json_utils.loads` on the stored JSON, or a default
       `ManualNodeState()` when `value` is falsy or unwraps to something
       falsy.
     """
     node_state_json = (
         data_types_utils.get_metadata_value(value) if value else None)
     if node_state_json:
         return json_utils.loads(node_state_json)
     return ManualNodeState()
Beispiel #7
0
 def is_node_stop_initiated(self, node_uid: task_lib.NodeUid) -> bool:
     """Returns `True` if stopping has been initiated for the given node.

     Raises:
       RuntimeError: If `node_uid` belongs to a different pipeline than this
         object's `pipeline_uid`.
     """
     if node_uid.pipeline_uid != self.pipeline_uid:
         raise RuntimeError(
             f'Node given by uid {node_uid} does not belong to pipeline given '
             f'by uid {self.pipeline_uid}')
     flag_key = _node_stop_initiated_property(node_uid)
     if flag_key not in self.execution.custom_properties:
         return False
     # The flag counts as set only when the stored value equals 1.
     stored_flag = data_types_utils.get_metadata_value(
         self.execution.custom_properties[flag_key])
     return stored_flag == 1
Beispiel #8
0
    def test_scheduler_failure(self):
        """Scheduler failure status marks the MLMD execution as failed."""
        # The fake task scheduler reports ABORTED with message 'foobar error'
        # and carries no executor output.
        aborted_status = status_lib.Status(code=status_lib.Code.ABORTED,
                                           message='foobar error')
        self._register_task_scheduler(
            ts.TaskSchedulerResult(status=aborted_status,
                                   executor_output=None))

        # The task manager itself must finish without raising.
        manager = self._run_task_manager()
        self.assertTrue(manager.done())
        self.assertIsNone(manager.exception())

        # The task must have been consumed and the MLMD execution marked
        # failed, with the scheduler's message recorded as the error message.
        self.assertTrue(self._task_queue.is_empty())
        execution = self._get_execution()
        self.assertEqual(metadata_store_pb2.Execution.FAILED,
                         execution.last_known_state)
        recorded_error = execution.custom_properties[
            constants.EXECUTION_ERROR_MSG_KEY]
        self.assertEqual('foobar error',
                         data_types_utils.get_metadata_value(recorded_error))
Beispiel #9
0
def extract_properties(
    execution: metadata_store_pb2.Execution
) -> Dict[str, types.ExecPropertyTypes]:
    """Extracts execution properties from mlmd Execution.

    Properties and custom properties are both visited. Keys recognized by
    `execution_lib.is_schema_key` are skipped. For every other key, a schema
    stored in the custom properties under the corresponding schema key (if
    any) is passed along when parsing the value.

    Args:
      execution: The MLMD execution to read from.

    Returns:
      A dict from property key to its parsed value.

    Raises:
      ValueError: If a property parses to `None`.
    """
    extracted = {}
    all_properties = itertools.chain(execution.properties.items(),
                                     execution.custom_properties.items())
    for key, prop in all_properties:
        if not execution_lib.is_schema_key(key):
            schema_key = execution_lib.get_schema_key(key)
            if schema_key in execution.custom_properties:
                schema_json = data_types_utils.get_metadata_value(
                    execution.custom_properties[schema_key])
                schema = proto_utils.json_to_proto(
                    schema_json, pipeline_pb2.Value.Schema())
            else:
                schema = None
            parsed = data_types_utils.get_parsed_value(prop, schema)

            if parsed is None:
                raise ValueError(
                    f'Unexpected property with empty value; key: {key}')
            extracted[key] = parsed
    return extracted
Beispiel #10
0
def _get_metadata_value(
        value: Optional[metadata_store_pb2.Value]) -> Optional[types.Property]:
    """Unwraps an MLMD value, passing `None` through unchanged."""
    return (None if value is None else
            data_types_utils.get_metadata_value(value))
Beispiel #11
0
    def _generate_tasks_for_node(
            self, node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
        """Generates list of tasks for the given node.

        Checks run in a fixed order and the first one that applies decides the
        result: node state STOPPING/STOPPED, pure service node status, an
        exec task already tracked by the task queue, a successful latest
        execution, an inactive latest execution, a task generated from an
        active execution, and finally input resolution.

        Args:
          node: The pipeline node to generate tasks for.

        Returns:
          The list of tasks for the node; may be empty.
        """
        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        node_id = node.node_info.id

        node_state = self._node_states_dict[node_uid]
        if node_state.state in (pstate.NodeState.STOPPING,
                                pstate.NodeState.STOPPED):
            logging.info(
                'Ignoring node in state \'%s\' for task generation: %s',
                node_state.state, node_uid)
            return []

        # If this is a pure service node, there is no ExecNodeTask to generate
        # but we ensure node services and check service status.
        service_status = self._ensure_node_services_if_pure(node_id)
        if service_status is not None:
            if service_status == service_jobs.ServiceStatus.FAILED:
                error_msg = f'service job failed; node uid: {node_uid}'
                aborted = status_lib.Status(code=status_lib.Code.ABORTED,
                                            message=error_msg)
                return [
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=aborted)
                ]
            if service_status == service_jobs.ServiceStatus.SUCCESS:
                logging.info('Service node successful: %s', node_uid)
                return [
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.COMPLETE)
                ]
            if service_status == service_jobs.ServiceStatus.RUNNING:
                return [
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.RUNNING)
                ]
            # Any other non-None service status yields no tasks.
            return []

        # If a task for the node is already tracked by the task queue, it need
        # not be considered for generation again but we ensure node services
        # in case of a mixed service node.
        exec_task_id = task_lib.exec_node_task_id_from_pipeline_node(
            self._pipeline, node)
        if self._is_task_id_tracked_fn(exec_task_id):
            service_status = self._ensure_node_services_if_mixed(node_id)
            if service_status != service_jobs.ServiceStatus.FAILED:
                return []
            error_msg = f'associated service job failed; node uid: {node_uid}'
            aborted = status_lib.Status(code=status_lib.Code.ABORTED,
                                        message=error_msg)
            return [
                task_lib.UpdateNodeStateTask(
                    node_uid=node_uid,
                    state=pstate.NodeState.FAILED,
                    status=aborted)
            ]

        node_executions = task_gen_utils.get_executions(
            self._mlmd_handle, node)
        latest_execution = task_gen_utils.get_latest_execution(node_executions)

        # If the latest execution is successful, we're done.
        if latest_execution and execution_lib.is_execution_successful(
                latest_execution):
            logging.info('Node successful: %s', node_uid)
            return [
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.COMPLETE)
            ]

        # If the latest execution failed or cancelled, the pipeline should be
        # aborted if the node is not in state STARTING. For nodes that are
        # in state STARTING, a new execution is created.
        if (latest_execution
                and not execution_lib.is_execution_active(latest_execution)
                and node_state.state != pstate.NodeState.STARTING):
            error_msg_value = latest_execution.custom_properties.get(
                constants.EXECUTION_ERROR_MSG_KEY)
            if error_msg_value:
                error_msg = data_types_utils.get_metadata_value(
                    error_msg_value)
            else:
                error_msg = ''
            error_msg = f'node failed; node uid: {node_uid}; error: {error_msg}'
            aborted = status_lib.Status(code=status_lib.Code.ABORTED,
                                        message=error_msg)
            return [
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.FAILED,
                                             status=aborted)
            ]

        active_exec_task = task_gen_utils.generate_task_from_active_execution(
            self._mlmd_handle, self._pipeline, node, node_executions)
        if active_exec_task:
            return [
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.RUNNING),
                active_exec_task,
            ]

        # Finally, we are ready to generate tasks for the node by resolving inputs.
        return list(self._resolve_inputs_and_generate_tasks_for_node(node))
Beispiel #12
0
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

        Nodes are visited layer by layer in the order produced by
        `topsort.topsorted_layers`. Generation stops early, returning a
        single-element list, when a service job or a node execution has
        failed (the task built by `self._abort_task`), when
        `self._maybe_generate_task` yields a finalize-pipeline task, or when
        every node in the final layer is successful (a `FinalizePipelineTask`
        with OK status).

        Returns:
          A `list` of tasks to execute. The list may be empty if no task was
          generated for any node.
        """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        # Ids of nodes found to be successful during this call, across layers.
        successful_node_ids = set()
        for layer_num, layer_nodes in enumerate(layers):
            for node in layer_nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id

                # Nodes already in the successful nodes cache are recorded as
                # successful without any further checks.
                if self._in_successful_nodes_cache(node_uid):
                    successful_node_ids.add(node_id)
                    continue

                # Skip the node unless the upstream check passes; presumably
                # this requires all upstream node ids to be in
                # `successful_node_ids` -- confirm against the helper.
                if not self._upstream_nodes_successful(node,
                                                       successful_node_ids):
                    continue

                # If this is a pure service node, there is no ExecNodeTask to generate
                # but we ensure node services and check service status.
                service_status = self._ensure_node_services_if_pure(node_id)
                if service_status is not None:
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'service job failed; node uid: {node_uid}')
                        ]
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node successful: %s', node_uid)
                        successful_node_ids.add(node_id)
                    # Any non-None status means nothing more is done for this
                    # node in this call.
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again but we ensure node services
                # in case of a mixed service node.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    service_status = self._ensure_node_services_if_mixed(
                        node_id)
                    if service_status == service_jobs.ServiceStatus.FAILED:
                        return [
                            self._abort_task(
                                f'associated service job failed; node uid: {node_uid}'
                            )
                        ]
                    continue

                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                latest_execution = task_gen_utils.get_latest_execution(
                    node_executions)

                # If the latest execution is successful, we're done.
                if latest_execution and execution_lib.is_execution_successful(
                        latest_execution):
                    logging.info('Node successful: %s', node_uid)
                    successful_node_ids.add(node_id)
                    continue

                # If the latest execution failed, the pipeline should be aborted.
                # This branch is reached for any latest execution that is
                # neither successful nor active.
                if latest_execution and not execution_lib.is_execution_active(
                        latest_execution):
                    # The error message custom property may be absent, in
                    # which case an empty error string is reported.
                    error_msg_value = latest_execution.custom_properties.get(
                        constants.EXECUTION_ERROR_MSG_KEY)
                    error_msg = data_types_utils.get_metadata_value(
                        error_msg_value) if error_msg_value else ''
                    return [
                        self._abort_task(
                            f'node failed; node uid: {node_uid}; error: {error_msg}'
                        )
                    ]

                # Finally, we are ready to generate an ExecNodeTask for the node.
                task = self._maybe_generate_task(node, node_executions,
                                                 successful_node_ids)
                if task:
                    # A finalize-pipeline task replaces everything collected
                    # so far and ends generation immediately.
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)

            # Record which nodes of this layer were found successful so the
            # cache check at the top of the node loop can use them.
            layer_node_ids = set(node.node_info.id for node in layer_nodes)
            successful_layer_node_ids = layer_node_ids & successful_node_ids
            self._update_successful_nodes_cache(successful_layer_node_ids)

            # If all nodes in the final layer are completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if (layer_num == len(layers) - 1
                    and successful_layer_node_ids == layer_node_ids):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
Beispiel #13
0
def _get_pipeline_from_orchestrator_execution(
        execution: metadata_store_pb2.Execution) -> pipeline_pb2.Pipeline:
    """Returns the pipeline decoded from an orchestrator execution.

    The `_PIPELINE_IR` execution property is unwrapped and handed to
    `_base64_decode_pipeline`, whose result is returned.
    """
    ir_property = execution.properties[_PIPELINE_IR]
    encoded_ir = data_types_utils.get_metadata_value(ir_property)
    return _base64_decode_pipeline(encoded_ir)
Beispiel #14
0
 def is_stop_initiated(self):
     """Returns `True` if pipeline execution stopping has been initiated."""
     if _STOP_INITIATED not in self.execution.custom_properties:
         return False
     # The flag counts as set only when the stored value equals 1.
     stored_flag = data_types_utils.get_metadata_value(
         self.execution.custom_properties[_STOP_INITIATED])
     return stored_flag == 1