Example #1
0
    def _build_spec_by_group(
        self,
        pipeline_spec: pipeline_spec_pb2.PipelineSpec,
        deployment_config: pipeline_spec_pb2.PipelineDeploymentConfig,
        group: tasks_group.TasksGroup,
        inputs: Mapping[str, List[Tuple[dsl.PipelineChannel, str]]],
        dependencies: Dict[str, List[_GroupOrTask]],
        rootgroup_name: str,
        task_name_to_parent_groups: Mapping[str, List[_GroupOrTask]],
        group_name_to_parent_groups: Mapping[str, List[tasks_group.TasksGroup]],
        name_to_for_loop_group: Mapping[str, dsl.ParallelFor],
    ) -> None:
        """Generates IR spec given a TasksGroup.

        For every direct child (sub-group or task) of ``group`` this builds a
        task spec and a component spec, registers the component spec in
        ``pipeline_spec.components`` when no entry of that name exists yet,
        and stores the task spec in the DAG of the component that represents
        ``group``. For tasks, the executor (container or importer) is also
        registered in ``deployment_config`` if its label is not present yet.
        Finally the deployment config is merged into
        ``pipeline_spec.deployment_spec`` and metrics outputs of the tasks of
        ``group`` are surfaced through the parent DAG outputs.

        Args:
            pipeline_spec: The pipeline_spec to update in place.
            deployment_config: The deployment_config to hold all executors. The
                spec is updated in place.
            group: The TasksGroup to generate spec for.
            inputs: The inputs dictionary. The keys are group/task names and the
                values are lists of tuples (channel, producing_task_name).
            dependencies: The group dependencies dictionary. The keys are group
                or task names, and the values are lists of dependent groups or
                tasks.
            rootgroup_name: The name of the group root. Used to determine whether
                the component spec for the current group should be the root dag.
            task_name_to_parent_groups: The dict of task name to parent groups.
                Key is task name. Value is a list of ancestor groups including
                the task itself. The list of a given task is sorted in a way that
                the farthest group is the first and the task itself is the last.
            group_name_to_parent_groups: The dict of group name to parent groups.
                Key is the group name. Value is a list of ancestor groups
                including the group itself. The list of a given group is sorted
                in a way that the farthest group is the first and the group
                itself is the last.
            name_to_for_loop_group: The dict of for loop group name to loop
                group.

        Raises:
            RuntimeError: If a child of ``group`` is neither a PipelineTask nor
                a ParallelFor, Condition or ExitHandler group.
        """
        group_component_name = component_utils.sanitize_component_name(
            group.name)

        # The root group is represented by `pipeline_spec.root`; every other
        # group has its own entry in `pipeline_spec.components`.
        if group.name == rootgroup_name:
            group_component_spec = pipeline_spec.root
        else:
            group_component_spec = pipeline_spec.components[
                group_component_name]

        task_name_to_task_spec = {}
        task_name_to_component_spec = {}

        # Generate task specs and component specs for the dag.
        subgroups = group.groups + group.tasks

        # The sanitized names of all children depend only on `subgroups`, so
        # they are computed once instead of once per child.
        tasks_in_current_dag = [
            component_utils.sanitize_task_name(sibling.name)
            for sibling in subgroups
        ]

        for subgroup in subgroups:

            subgroup_inputs = inputs.get(subgroup.name, [])
            subgroup_channels = [channel for channel, _ in subgroup_inputs]

            subgroup_component_name = (
                component_utils.sanitize_component_name(subgroup.name))

            input_parameters_in_current_dag = list(
                group_component_spec.input_definitions.parameters)
            input_artifacts_in_current_dag = list(
                group_component_spec.input_definitions.artifacts)
            # NOTE(review): protobuf messages compare by value, so this is a
            # content comparison rather than an identity check; confirm
            # whether `group.name == rootgroup_name` is the intended test.
            is_parent_component_root = (
                group_component_spec == pipeline_spec.root)

            if isinstance(subgroup, pipeline_task.PipelineTask):

                subgroup_task_spec = builder.build_task_spec_for_task(
                    task=subgroup,
                    parent_component_inputs=group_component_spec
                    .input_definitions,
                    tasks_in_current_dag=tasks_in_current_dag,
                    input_parameters_in_current_dag=input_parameters_in_current_dag,
                    input_artifacts_in_current_dag=input_artifacts_in_current_dag,
                )
                task_name_to_task_spec[subgroup.name] = subgroup_task_spec

                subgroup_component_spec = builder.build_component_spec_for_task(
                    task=subgroup)
                task_name_to_component_spec[
                    subgroup.name] = subgroup_component_spec

                executor_label = subgroup_component_spec.executor_label

                # Register the executor only once per label.
                if executor_label not in deployment_config.executors:
                    if subgroup.container_spec is not None:
                        subgroup_container_spec = builder.build_container_spec_for_task(
                            task=subgroup)
                        deployment_config.executors[
                            executor_label].container.CopyFrom(
                                subgroup_container_spec)
                    elif subgroup.importer_spec is not None:
                        subgroup_importer_spec = builder.build_importer_spec_for_task(
                            task=subgroup)
                        deployment_config.executors[
                            executor_label].importer.CopyFrom(
                                subgroup_importer_spec)
            elif isinstance(subgroup, dsl.ParallelFor):

                # "Punch the hole", adding additional inputs (other than loop
                # arguments which will be handled separately) needed by its
                # subgroups or tasks.
                loop_subgroup_channels = []

                for channel in subgroup_channels:
                    # Skip 'withItems' loop arguments if it's from an inner loop.
                    if isinstance(
                            channel,
                        (for_loop.LoopArgument, for_loop.LoopArgumentVariable
                        )) and channel.is_with_items_loop_argument:
                        withitems_loop_arg_found_in_self_or_upstream = False
                        # Walk from the loop group itself outwards to the
                        # farthest ancestor.
                        for group_name in group_name_to_parent_groups[
                                subgroup.name][::-1]:
                            if group_name in name_to_for_loop_group:
                                loop_group = name_to_for_loop_group[group_name]
                                if channel.name in loop_group.loop_argument.name:
                                    withitems_loop_arg_found_in_self_or_upstream = True
                                    break
                        if not withitems_loop_arg_found_in_self_or_upstream:
                            continue
                    loop_subgroup_channels.append(channel)

                if subgroup.items_is_pipeline_channel:
                    # This loop_argument is based on a pipeline channel, i.e.,
                    # rather than a static list, it is either the output of
                    # another task or an input as global pipeline parameters.
                    loop_subgroup_channels.append(
                        subgroup.loop_argument.items_or_pipeline_channel)

                loop_subgroup_channels.append(subgroup.loop_argument)

                subgroup_component_spec = builder.build_component_spec_for_group(
                    pipeline_channels=loop_subgroup_channels,
                    is_root_group=False,
                )

                subgroup_task_spec = builder.build_task_spec_for_group(
                    group=subgroup,
                    pipeline_channels=loop_subgroup_channels,
                    tasks_in_current_dag=tasks_in_current_dag,
                    is_parent_component_root=is_parent_component_root,
                )

            elif isinstance(subgroup, dsl.Condition):

                # "Punch the hole", adding inputs needed by its subgroups or
                # tasks.
                condition_subgroup_channels = list(subgroup_channels)
                for operand in [
                        subgroup.condition.left_operand,
                        subgroup.condition.right_operand,
                ]:
                    if isinstance(operand, dsl.PipelineChannel):
                        condition_subgroup_channels.append(operand)

                subgroup_component_spec = builder.build_component_spec_for_group(
                    pipeline_channels=condition_subgroup_channels,
                    is_root_group=False,
                )

                subgroup_task_spec = builder.build_task_spec_for_group(
                    group=subgroup,
                    pipeline_channels=condition_subgroup_channels,
                    tasks_in_current_dag=tasks_in_current_dag,
                    is_parent_component_root=is_parent_component_root,
                )

            elif isinstance(subgroup, dsl.ExitHandler):

                subgroup_component_spec = builder.build_component_spec_for_group(
                    pipeline_channels=subgroup_channels,
                    is_root_group=False,
                )

                subgroup_task_spec = builder.build_task_spec_for_group(
                    group=subgroup,
                    pipeline_channels=subgroup_channels,
                    tasks_in_current_dag=tasks_in_current_dag,
                    is_parent_component_root=is_parent_component_root,
                )

            else:
                raise RuntimeError(
                    f'Unexpected task/group type: Got {subgroup} of type '
                    f'{type(subgroup)}.')

            # Generate dependencies section for this task. Dependencies are
            # sorted before being written to the task spec.
            if dependencies.get(subgroup.name, None):
                group_dependencies = sorted(dependencies[subgroup.name])
                subgroup_task_spec.dependent_tasks.extend([
                    component_utils.sanitize_task_name(dep)
                    for dep in group_dependencies
                ])

            # Add component spec if not exists
            if subgroup_component_name not in pipeline_spec.components:
                pipeline_spec.components[subgroup_component_name].CopyFrom(
                    subgroup_component_spec)

            # Add task spec
            group_component_spec.dag.tasks[subgroup.name].CopyFrom(
                subgroup_task_spec)

        pipeline_spec.deployment_spec.update(
            json_format.MessageToDict(deployment_config))

        # Surface metrics outputs to the top.
        builder.populate_metrics_in_dag_outputs(
            tasks=group.tasks,
            task_name_to_parent_groups=task_name_to_parent_groups,
            task_name_to_task_spec=task_name_to_task_spec,
            task_name_to_component_spec=task_name_to_component_spec,
            pipeline_spec=pipeline_spec,
        )
def populate_metrics_in_dag_outputs(
    tasks: List[pipeline_task.PipelineTask],
    task_name_to_parent_groups: Mapping[str, List[_GroupOrTask]],
    task_name_to_task_spec: Mapping[str, pipeline_spec_pb2.PipelineTaskSpec],
    task_name_to_component_spec: Mapping[str, pipeline_spec_pb2.ComponentSpec],
    pipeline_spec: pipeline_spec_pb2.PipelineSpec,
) -> None:
    """Populates metrics artifacts in DAG outputs.

    Each output artifact of a task whose schema title is the Metrics or
    ClassificationMetrics type name is re-exported, under the name
    ``<task name>-<output name>``, by every enclosing group component and
    finally by the root component. Each level selects the output from the
    level directly below it.

    Args:
        tasks: The list of tasks that may produce metrics outputs.
        task_name_to_parent_groups: The dict of task name to parent groups.
            Key is the task's name. Value is a list of ancestor groups including
            the task itself. The list of a given op is sorted in a way that the
            farthest group is the first and the task itself is the last.
        task_name_to_task_spec: The dict of task name to PipelineTaskSpec.
        task_name_to_component_spec: The dict of task name to ComponentSpec.
        pipeline_spec: The pipeline_spec to update in-place.
    """
    for task in tasks:
        # Both mappings must know the task; the task spec itself is not used
        # any further here.
        _task_spec = task_name_to_task_spec[task.name]
        component_spec = task_name_to_component_spec[task.name]

        # Drop the first entry (the root group, which cannot be retrieved via
        # its name) and the last entry (the task itself).
        enclosing_groups = task_name_to_parent_groups[task.name][1:-1]

        # (component_name, task_name) pairs ordered from the innermost group
        # outwards, ending with the root component.
        ancestry = [(component_utils.sanitize_component_name(group_name),
                     component_utils.sanitize_task_name(group_name))
                    for group_name in reversed(enclosing_groups)]
        ancestry.append(('_root', ''))

        output_artifacts = component_spec.output_definitions.artifacts
        for output_name, artifact_spec in output_artifacts.items():
            artifact_type = artifact_spec.artifact_type
            if artifact_type.WhichOneof('kind') != 'schema_title':
                continue
            if artifact_type.schema_title not in [
                    artifact_types.Metrics.TYPE_NAME,
                    artifact_types.ClassificationMetrics.TYPE_NAME,
            ]:
                continue

            unique_output_name = '{}-{}'.format(task.name, output_name)

            # The innermost level selects the task's own output; each outer
            # level then selects the re-exported output of the level below.
            producer_name = task.name
            producer_output_key = output_name
            for component_name, task_name in ancestry:
                if component_name == '_root':
                    dag_component_spec = pipeline_spec.root
                else:
                    dag_component_spec = pipeline_spec.components[
                        component_name]

                dag_component_spec.output_definitions.artifacts[
                    unique_output_name].CopyFrom(artifact_spec)

                selector = pipeline_spec_pb2.DagOutputsSpec.ArtifactSelectorSpec(
                    producer_subtask=producer_name,
                    output_artifact_key=producer_output_key,
                )
                dag_component_spec.dag.outputs.artifacts[
                    unique_output_name].artifact_selectors.append(selector)

                producer_name = task_name
                producer_output_key = unique_output_name
def build_task_spec_for_group(
    group: tasks_group.TasksGroup,
    pipeline_channels: List[pipeline_channel.PipelineChannel],
    tasks_in_current_dag: List[str],
    is_parent_component_root: bool,
) -> pipeline_spec_pb2.PipelineTaskSpec:
    """Builds PipelineTaskSpec for a group.

    Every channel becomes one input of the group's task. The input is wired
    to the output of a sibling task when the channel's producer is listed in
    ``tasks_in_current_dag``, and to an input of the parent component
    otherwise.

    Args:
        group: The group to build PipelineTaskSpec for.
        pipeline_channels: The list of pipeline channels referenced by the group.
        tasks_in_current_dag: The list of tasks names for tasks in the same dag.
        is_parent_component_root: Whether the parent component is the pipeline's
            root dag.

    Returns:
        A PipelineTaskSpec object representing the group.
    """
    task_spec = pipeline_spec_pb2.PipelineTaskSpec()
    task_spec.task_info.name = group.display_name or group.name
    task_spec.component_ref.name = component_utils.sanitize_component_name(
        group.name)
    task_inputs = task_spec.inputs

    for channel in pipeline_channels:

        # A loop-argument sub-variable is sourced from its whole loop
        # argument and narrowed down with an expression selector below.
        if isinstance(channel, for_loop.LoopArgumentVariable):
            source_full_name = channel.loop_argument.full_name
            subvar_name = channel.subvar_name
        else:
            source_full_name = channel.full_name
            subvar_name = None

        input_name = _additional_input_name_for_pipeline_channel(channel)

        output_key = channel.name
        if subvar_name:
            task_inputs.parameters[
                input_name].parameter_expression_selector = (
                    'parseJson(string_value)["{}"]'.format(subvar_name))
            if not channel.is_with_items_loop_argument:
                output_key = channel.items_or_pipeline_channel.name

        produced_in_current_dag = bool(
            channel.task_name and channel.task_name in tasks_in_current_dag)

        if isinstance(channel, pipeline_channel.PipelineArtifactChannel):
            if produced_in_current_dag:
                task_inputs.artifacts[
                    input_name].task_output_artifact.producer_task = (
                        component_utils.sanitize_task_name(channel.task_name))
                task_inputs.artifacts[
                    input_name].task_output_artifact.output_artifact_key = (
                        output_key)
            elif is_parent_component_root:
                task_inputs.artifacts[
                    input_name].component_input_artifact = source_full_name
            else:
                task_inputs.artifacts[
                    input_name].component_input_artifact = input_name
            continue

        # channel is one of PipelineParameterChannel, LoopArgument, or
        # LoopArgumentVariable
        if produced_in_current_dag:
            task_inputs.parameters[
                input_name].task_output_parameter.producer_task = (
                    component_utils.sanitize_task_name(channel.task_name))
            task_inputs.parameters[
                input_name].task_output_parameter.output_parameter_key = (
                    output_key)
        elif is_parent_component_root:
            task_inputs.parameters[
                input_name].component_input_parameter = source_full_name
        else:
            task_inputs.parameters[input_name].component_input_parameter = (
                _additional_input_name_for_pipeline_channel(source_full_name))

    # Loop and condition groups need extra, type-specific fields.
    if isinstance(group, tasks_group.ParallelFor):
        _update_task_spec_for_loop_group(
            group=group,
            pipeline_task_spec=task_spec,
        )
    elif isinstance(group, tasks_group.Condition):
        _update_task_spec_for_condition_group(
            group=group,
            pipeline_task_spec=task_spec,
        )

    return task_spec
def build_task_spec_for_task(
    task: pipeline_task.PipelineTask,
    parent_component_inputs: pipeline_spec_pb2.ComponentInputsSpec,
    tasks_in_current_dag: List[str],
    input_parameters_in_current_dag: List[str],
    input_artifacts_in_current_dag: List[str],
) -> pipeline_spec_pb2.PipelineTaskSpec:
    """Builds PipelineTaskSpec for a pipeline task.

    A task input may reference an output outside its immediate DAG.
    For instance::

        random_num = random_num_op(...)
        with dsl.Condition(random_num.output > 5):
            print_op('%s > 5' % random_num.output)

    In this example, `dsl.Condition` forms a subDAG with one task from `print_op`
    inside the subDAG. The task of `print_op` references output from `random_num`
    task, which is outside the sub-DAG. When compiling to IR, such cross DAG
    reference is disallowed. So we need to "punch a hole" in the sub-DAG to make
    the input available in the subDAG component inputs if it's not already there,
    Next, we can call this method to fix the tasks inside the subDAG to make them
    reference the component inputs instead of directly referencing the original
    producer task.

    Args:
        task: The task to build a PipelineTaskSpec for.
        parent_component_inputs: The task's parent component's input specs.
        tasks_in_current_dag: The list of tasks names for tasks in the same dag.
        input_parameters_in_current_dag: The list of input parameters in the DAG
            component. Not read by this function.
        input_artifacts_in_current_dag: The list of input artifacts in the DAG
            component. Not read by this function.

    Returns:
        A PipelineTaskSpec object representing the task.

    Raises:
        RuntimeError: If an artifact input is not produced by a task, or if a
            compiler-injected input name collides with an existing input name.
        ValueError: If an input value has an unsupported type.
        AssertionError: If an input that comes from outside the current DAG
            is not declared in ``parent_component_inputs``.
    """
    pipeline_task_spec = pipeline_spec_pb2.PipelineTaskSpec()
    pipeline_task_spec.task_info.name = (
        task.task_spec.display_name or task.name)
    # Use task.name for component_ref.name because we may customize component
    # spec for individual tasks to work around the lack of optional inputs
    # support in IR.
    pipeline_task_spec.component_ref.name = (
        component_utils.sanitize_component_name(task.name))
    pipeline_task_spec.caching_options.enable_cache = (
        task.task_spec.enable_caching)

    for input_name, input_value in task.inputs.items():
        # NOTE(review): the value is not used below; the lookup is kept
        # because it fails for inputs the component spec does not declare.
        # Confirm whether that is relied upon.
        input_type = task.component_spec.inputs[input_name].type

        if isinstance(input_value, pipeline_channel.PipelineArtifactChannel):

            if input_value.task_name:
                # Value is produced by an upstream task.
                if input_value.task_name in tasks_in_current_dag:
                    # Dependent task within the same DAG.
                    pipeline_task_spec.inputs.artifacts[
                        input_name].task_output_artifact.producer_task = (
                            component_utils.sanitize_task_name(
                                input_value.task_name))
                    pipeline_task_spec.inputs.artifacts[
                        input_name].task_output_artifact.output_artifact_key = (
                            input_value.name)
                else:
                    # Dependent task not from the same DAG.
                    component_input_artifact = (
                        _additional_input_name_for_pipeline_channel(input_value)
                    )
                    assert component_input_artifact in parent_component_inputs.artifacts, \
                        'component_input_artifact: {} not found. All inputs: {}'.format(
                            component_input_artifact, parent_component_inputs)
                    pipeline_task_spec.inputs.artifacts[
                        input_name].component_input_artifact = (
                            component_input_artifact)
            else:
                raise RuntimeError(
                    f'Artifacts must be produced by a task. Got {input_value}.')

        elif isinstance(input_value, pipeline_channel.PipelineParameterChannel):

            if input_value.task_name:
                # Value is produced by an upstream task.
                if input_value.task_name in tasks_in_current_dag:
                    # Dependent task within the same DAG.
                    pipeline_task_spec.inputs.parameters[
                        input_name].task_output_parameter.producer_task = (
                            component_utils.sanitize_task_name(
                                input_value.task_name))
                    pipeline_task_spec.inputs.parameters[
                        input_name].task_output_parameter.output_parameter_key = (
                            input_value.name)
                else:
                    # Dependent task not from the same DAG.
                    component_input_parameter = (
                        _additional_input_name_for_pipeline_channel(input_value)
                    )
                    assert component_input_parameter in parent_component_inputs.parameters, \
                        'component_input_parameter: {} not found. All inputs: {}'.format(
                            component_input_parameter, parent_component_inputs)
                    pipeline_task_spec.inputs.parameters[
                        input_name].component_input_parameter = (
                            component_input_parameter)
            else:
                # Value is from pipeline input.
                component_input_parameter = input_value.full_name
                if component_input_parameter not in parent_component_inputs.parameters:
                    component_input_parameter = (
                        _additional_input_name_for_pipeline_channel(input_value)
                    )
                pipeline_task_spec.inputs.parameters[
                    input_name].component_input_parameter = (
                        component_input_parameter)

        elif isinstance(input_value, for_loop.LoopArgument):

            component_input_parameter = (
                _additional_input_name_for_pipeline_channel(input_value))
            assert component_input_parameter in parent_component_inputs.parameters, \
                'component_input_parameter: {} not found. All inputs: {}'.format(
                    component_input_parameter, parent_component_inputs)
            pipeline_task_spec.inputs.parameters[
                input_name].component_input_parameter = (
                    component_input_parameter)

        elif isinstance(input_value, for_loop.LoopArgumentVariable):

            # The whole loop argument is passed in and the sub-variable is
            # picked out of it with an expression selector.
            component_input_parameter = (
                _additional_input_name_for_pipeline_channel(
                    input_value.loop_argument))
            assert component_input_parameter in parent_component_inputs.parameters, \
                'component_input_parameter: {} not found. All inputs: {}'.format(
                    component_input_parameter, parent_component_inputs)
            pipeline_task_spec.inputs.parameters[
                input_name].component_input_parameter = (
                    component_input_parameter)
            pipeline_task_spec.inputs.parameters[
                input_name].parameter_expression_selector = (
                    'parseJson(string_value)["{}"]'.format(
                        input_value.subvar_name))

        elif isinstance(input_value, str):

            # Handle extra input due to string concat
            pipeline_channels = (
                pipeline_channel.extract_pipeline_channels_from_any(input_value)
            )
            for channel in pipeline_channels:
                # value contains PipelineChannel placeholders which needs to be
                # replaced. And the input needs to be added to the task spec.

                # Form the name for the compiler injected input, and make sure it
                # doesn't collide with any existing input names.
                additional_input_name = (
                    _additional_input_name_for_pipeline_channel(channel))

                # We don't expect collision to happen because we prefix the name
                # of additional input with 'pipelinechannel--'. But just in case
                # collision did happen, throw a RuntimeError so that we don't
                # get surprise at runtime.
                if additional_input_name in task.inputs:
                    raise RuntimeError(
                        'Name collision between existing input name '
                        '{} and compiler injected input name {}'.format(
                            additional_input_name, additional_input_name))

                additional_input_placeholder = (
                    placeholders.input_parameter_placeholder(
                        additional_input_name))
                input_value = input_value.replace(channel.pattern,
                                                  additional_input_placeholder)

                if channel.task_name:
                    # Value is produced by an upstream task.
                    if channel.task_name in tasks_in_current_dag:
                        # Dependent task within the same DAG. Both fields
                        # belong to the injected input: `input_name` itself
                        # receives the constant string set after this loop.
                        pipeline_task_spec.inputs.parameters[
                            additional_input_name].task_output_parameter.producer_task = (
                                component_utils.sanitize_task_name(
                                    channel.task_name))
                        pipeline_task_spec.inputs.parameters[
                            additional_input_name].task_output_parameter.output_parameter_key = (
                                channel.name)
                    else:
                        # Dependent task not from the same DAG.
                        component_input_parameter = (
                            _additional_input_name_for_pipeline_channel(channel)
                        )
                        assert component_input_parameter in parent_component_inputs.parameters, \
                            'component_input_parameter: {} not found. All inputs: {}'.format(
                                component_input_parameter, parent_component_inputs)
                        pipeline_task_spec.inputs.parameters[
                            additional_input_name].component_input_parameter = (
                                component_input_parameter)
                else:
                    # Value is from pipeline input. (or loop?)
                    component_input_parameter = channel.full_name
                    if component_input_parameter not in parent_component_inputs.parameters:
                        component_input_parameter = (
                            _additional_input_name_for_pipeline_channel(channel)
                        )
                    pipeline_task_spec.inputs.parameters[
                        additional_input_name].component_input_parameter = (
                            component_input_parameter)

            # The original input carries the string with every channel pattern
            # replaced by the placeholder of its injected input.
            pipeline_task_spec.inputs.parameters[
                input_name].runtime_value.constant.string_value = input_value

        elif isinstance(input_value, (str, int, float, bool, dict, list)):

            # `str` values never reach this branch; they are handled above.
            pipeline_task_spec.inputs.parameters[
                input_name].runtime_value.constant.CopyFrom(
                    _to_protobuf_value(input_value))

        else:
            raise ValueError(
                'Input argument supports only the following types: '
                'str, int, float, bool, dict, and list.'
                f'Got {input_value} of type {type(input_value)}.')

    return pipeline_task_spec
Example #5
0
 def test_sanitize_component_name(self):
     """Checks that 'My component' is sanitized to 'comp-my-component'."""
     expected_name = 'comp-my-component'
     sanitized_name = utils.sanitize_component_name('My component')
     self.assertEqual(expected_name, sanitized_name)