def run(self, pipeline: tfx_pipeline.Pipeline) -> None:
        """Deploys given logical pipeline on Kubernetes.

    Args:
      pipeline: Logical pipeline containing pipeline args and components.
    """
        if not pipeline.pipeline_info.run_id:
            pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat()

        if not kube_utils.is_inside_cluster():
            kubernetes_remote_runner.run_as_kubernetes_job(
                pipeline=pipeline, tfx_image=self._config.tfx_image)
            return
        # TODO(ericlege): Support running components in parallel.
        ran_components = set()

        # Runs component in topological order.
        for component in pipeline.components:
            # Verify that components are in topological order.
            if hasattr(component,
                       'upstream_nodes') and component.upstream_nodes:
                for upstream_node in component.upstream_nodes:
                    assert upstream_node in ran_components, (
                        'Components are not in '
                        'topological order')

            (component_launcher_class,
             component_config) = config_utils.find_component_launch_info(
                 self._config, component)

            # Check if the component is launchable as a container component.
            if kubernetes_component_launcher.KubernetesComponentLauncher.can_launch(
                    component.executor_spec, component_config):
                launch_container_component(component, component_launcher_class,
                                           component_config, pipeline)
            # Otherwise, the component should be launchable with the in-process
            # component launcher; wrap it into a container component.
            elif in_process_component_launcher.InProcessComponentLauncher.can_launch(
                    component.executor_spec, component_config):
                wrapped_component = self._wrap_container_component(
                    component=component,
                    component_launcher_class=component_launcher_class,
                    component_config=component_config,
                    pipeline=pipeline)

                # Component launch info is updated by wrapping the component into a
                # container component. Therefore, these properties need to be reloaded.
                (wrapped_component_launcher_class, wrapped_component_config
                 ) = config_utils.find_component_launch_info(
                     self._config, wrapped_component)

                launch_container_component(wrapped_component,
                                           wrapped_component_launcher_class,
                                           wrapped_component_config, pipeline)
            else:
                raise ValueError(
                    'Cannot find a suitable launcher for the component.')

            ran_components.add(component)
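
A minimal usage sketch for a runner like the one above. The module path, runner class, and config class below are assumptions based on TFX's experimental Kubernetes orchestrator; the image name is illustrative.

# Hypothetical usage sketch -- names below are assumptions, not confirmed API.
from tfx.orchestration.experimental.kubernetes import kubernetes_dag_runner

runner = kubernetes_dag_runner.KubernetesDagRunner(
    config=kubernetes_dag_runner.KubernetesDagRunnerConfig(
        tfx_image='gcr.io/my-project/my-tfx-image'))  # illustrative image
runner.run(my_pipeline)  # my_pipeline: a logical tfx_pipeline.Pipeline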
Example #2
    def testFindComponentLaunchInfoFailWithNoLauncherClassFound(self):
        input_artifact = test_utils._InputArtifact()
        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel_utils.as_channel([input_artifact]))
        p_config = pipeline_config.PipelineConfig(supported_launcher_classes=[
            docker_component_launcher.DockerComponentLauncher
        ])

        with self.assertRaises(RuntimeError):
            # DockerComponentLauncher cannot launch a class-based executor.
            config_utils.find_component_launch_info(p_config, component)
Example #3
    def testFindComponentLaunchInfoReturnConfigOverride(self):
        input_artifact = test_utils._InputArtifact()
        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel_utils.as_channel([input_artifact]),
            custom_executor_spec=executor_spec.ExecutorContainerSpec(
                image='gcr://test', args=['{{input_dict["input"][0].uri}}']))
        default_config = docker_component_config.DockerComponentConfig()
        override_config = docker_component_config.DockerComponentConfig(
            name='test')
        p_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                docker_component_launcher.DockerComponentLauncher
            ],
            default_component_configs=[default_config],
            component_config_overrides={
                '_FakeComponent.FakeComponent': override_config
            })

        (launcher_class, c_config) = config_utils.find_component_launch_info(
            p_config, component)

        self.assertEqual(docker_component_launcher.DockerComponentLauncher,
                         launcher_class)
        self.assertEqual(override_config, c_config)
Example #4
    def run(self, tfx_pipeline: pipeline.Pipeline) -> None:

        with beam.Pipeline(argv=self._beam_orchestrator_args) as p:
            root = p | 'CreateRoot' >> beam.Create([None])

            signal_map = {}
            for component in tfx_pipeline.components:
                component_id = component.id

                signals_to_wait = []
                if component.upstream_nodes:
                    for upstream_node in component.upstream_nodes:
                        assert upstream_node in signal_map, (
                            'Components are not in '
                            'topological order')
                        signals_to_wait.append(signal_map[upstream_node])
                logger.debug('Component %s depends on %s.', component_id,
                             [s.producer.full_label for s in signals_to_wait])

                (component_launcher_class,
                 component_config) = config_utils.find_component_launch_info(
                     self._config, component)

                signal_map[component] = (
                    root
                    | 'Run[%s]' % component_id >> beam.ParDo(
                        _ComponentAsDoFn(component, component_launcher_class,
                                         component_config, tfx_pipeline),
                        *[beam.pvalue.AsIter(s) for s in signals_to_wait]))
                logger.debug('Component %s is scheduled.', component_id)
Example #5
    def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
        """Deploys given logical pipeline on Beam.

    Args:
      tfx_pipeline: Logical pipeline containing pipeline args and components.
    """
        # For CLI, while creating or updating pipeline, pipeline_args are extracted
        # and hence we avoid executing the pipeline.
        if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
            return

        tfx_pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat()

        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}):
            with beam.Pipeline(argv=self._beam_orchestrator_args) as p:
                # Used for triggering the component DoFns.
                root = p | 'CreateRoot' >> beam.Create([None])

                # Stores mapping of component to its signal.
                signal_map = {}
                # pipeline.components are in topological order.
                for component in tfx_pipeline.components:
                    # TODO(b/187122662): Pass through pip dependencies as a first-class
                    # component flag.
                    if isinstance(component, base_component.BaseComponent):
                        component._resolve_pip_dependencies(  # pylint: disable=protected-access
                            tfx_pipeline.pipeline_info.pipeline_root)
                    component_id = component.id

                    # Signals from upstream components.
                    signals_to_wait = []
                    if component.upstream_nodes:
                        for upstream_node in component.upstream_nodes:
                            assert upstream_node in signal_map, (
                                'Components are not in '
                                'topological order')
                            signals_to_wait.append(signal_map[upstream_node])
                    absl.logging.info(
                        'Component %s depends on %s.', component_id,
                        [s.producer.full_label for s in signals_to_wait])

                    (component_launcher_class, component_config
                     ) = config_utils.find_component_launch_info(
                         self._config, component)

                    # Each signal is an empty PCollection. AsIter ensures component will
                    # be triggered after upstream components are finished.
                    signal_map[component] = (
                        root
                        | 'Run[%s]' % component_id >> beam.ParDo(
                            _ComponentAsDoFn(component,
                                             component_launcher_class,
                                             component_config, tfx_pipeline),
                            *[beam.pvalue.AsIter(s) for s in signals_to_wait]))
                    absl.logging.info('Component %s is scheduled.',
                                      component_id)
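
The comments in the snippet above describe the ordering mechanism: each component's output is an empty PCollection, and wrapping it in beam.pvalue.AsIter makes it a side input of every downstream step, which therefore cannot start until the upstream step has finished. A standalone, runnable sketch of that pattern (step names are illustrative):

import apache_beam as beam

class _Step(beam.DoFn):
    """Stands in for launching one component."""

    def __init__(self, name):
        self._name = name

    def process(self, element, *signals):
        # Side-input signals are fully computed before process() runs.
        print('running %s' % self._name)
        yield None  # The emitted element doubles as a completion signal.

with beam.Pipeline() as p:
    root = p | 'CreateRoot' >> beam.Create([None])
    a = root | 'Run[A]' >> beam.ParDo(_Step('A'))
    # AsIter turns A's output into a side input of B, so B starts only
    # after A has finished -- the same trick used for upstream_nodes above.
    b = root | 'Run[B]' >> beam.ParDo(_Step('B'), beam.pvalue.AsIter(a))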
Example #6
    def run(self, tfx_pipeline: pipeline.Pipeline):
        """Deploys given logical pipeline on Airflow.

    Args:
      tfx_pipeline: Logical pipeline containing pipeline args and components.

    Returns:
      An Airflow DAG.
    """

        # Merge airflow-specific configs with pipeline args
        airflow_dag = models.DAG(
            dag_id=tfx_pipeline.pipeline_info.pipeline_name,
            **(typing.cast(AirflowPipelineConfig,
                           self._config).airflow_dag_config))
        if 'tmp_dir' not in tfx_pipeline.additional_pipeline_args:
            tmp_dir = os.path.join(tfx_pipeline.pipeline_info.pipeline_root,
                                   '.temp', '')
            tfx_pipeline.additional_pipeline_args['tmp_dir'] = tmp_dir

        component_impl_map = {}
        for tfx_component in tfx_pipeline.components:
            # TODO(b/187122662): Pass through pip dependencies as a first-class
            # component flag.
            if isinstance(tfx_component, base_component.BaseComponent):
                tfx_component._resolve_pip_dependencies(  # pylint: disable=protected-access
                    tfx_pipeline.pipeline_info.pipeline_root)

            tfx_component = self._replace_runtime_params(tfx_component)

            (component_launcher_class,
             component_config) = config_utils.find_component_launch_info(
                 self._config, tfx_component)
            current_airflow_component = airflow_component.AirflowComponent(
                parent_dag=airflow_dag,
                component=tfx_component,
                component_launcher_class=component_launcher_class,
                pipeline_info=tfx_pipeline.pipeline_info,
                enable_cache=tfx_pipeline.enable_cache,
                metadata_connection_config=(
                    tfx_pipeline.metadata_connection_config),
                beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
                additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
                component_config=component_config)
            component_impl_map[tfx_component] = current_airflow_component
            for upstream_node in tfx_component.upstream_nodes:
                assert upstream_node in component_impl_map, (
                    'Components are not in '
                    'topological order')
                current_airflow_component.set_upstream(
                    component_impl_map[upstream_node])

        return airflow_dag
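
In practice, a DAG built this way is created at module level in a file under Airflow's dags/ folder so the scheduler can discover it. A sketch, assuming a create_pipeline() helper that builds the logical pipeline (the config values are illustrative):

import datetime

from tfx.orchestration.airflow.airflow_dag_runner import AirflowDagRunner
from tfx.orchestration.airflow.airflow_dag_runner import AirflowPipelineConfig

_airflow_config = {
    'schedule_interval': None,
    'start_date': datetime.datetime(2021, 1, 1),
}

# Airflow picks up the module-level DAG object returned by run().
DAG = AirflowDagRunner(AirflowPipelineConfig(_airflow_config)).run(
    create_pipeline())  # create_pipeline() is assumed to exist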
Example #7
  def testFindComponentLaunchInfoReturnDefaultLaunchInfo(self):
    input_artifact = types.Artifact(type_name='InputPath')
    component = test_utils._FakeComponent(
        name='FakeComponent',
        input_channel=channel_utils.as_channel([input_artifact]))
    p_config = pipeline_config.PipelineConfig()

    (launcher_class,
     c_config) = config_utils.find_component_launch_info(p_config, component)

    self.assertEqual(in_process_component_launcher.InProcessComponentLauncher,
                     launcher_class)
    self.assertIsNone(c_config)
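Example #8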
    def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
        """Deploys given logical pipeline on Beam.

    Args:
      tfx_pipeline: Logical pipeline containing pipeline args and components.
    """
        # For CLI, while creating or updating pipeline, pipeline_args are extracted
        # and hence we avoid deploying the pipeline.
        if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
            return

        tfx_pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat()

        with beam.Pipeline(argv=self._beam_orchestrator_args) as p:
            # Used for triggering the component DoFns.
            root = p | 'CreateRoot' >> beam.Create([None])

            # Stores mapping of component to its signal.
            signal_map = {}
            # pipeline.components are in topological order.
            for component in tfx_pipeline.components:
                component_id = component.id

                # Signals from upstream components.
                signals_to_wait = []
                if component.upstream_nodes:
                    for upstream_node in component.upstream_nodes:
                        assert upstream_node in signal_map, (
                            'Components are not in '
                            'topological order')
                        signals_to_wait.append(signal_map[upstream_node])
                absl.logging.info(
                    'Component %s depends on %s.', component_id,
                    [s.producer.full_label for s in signals_to_wait])

                (component_launcher_class,
                 component_config) = config_utils.find_component_launch_info(
                     self._config, component)

                # Each signal is an empty PCollection. AsIter ensures component will be
                # triggered after upstream components are finished.
                signal_map[component] = (
                    root
                    | 'Run[%s]' % component_id >> beam.ParDo(
                        _ComponentAsDoFn(component, component_launcher_class,
                                         component_config, tfx_pipeline),
                        *[beam.pvalue.AsIter(s) for s in signals_to_wait]))
                absl.logging.info('Component %s is scheduled.', component_id)
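Example #9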
    def run(self, tfx_pipeline: pipeline.Pipeline):
        """Deploys given logical pipeline on Airflow.

    Args:
      tfx_pipeline: Logical pipeline containing pipeline args and components.

    Returns:
      An Airflow DAG.
    """

        # Merge airflow-specific configs with pipeline args
        airflow_dag = models.DAG(
            dag_id=tfx_pipeline.pipeline_info.pipeline_name,
            **self._config.airflow_dag_config)
        if 'tmp_dir' not in tfx_pipeline.additional_pipeline_args:
            tmp_dir = os.path.join(tfx_pipeline.pipeline_info.pipeline_root,
                                   '.temp', '')
            tfx_pipeline.additional_pipeline_args['tmp_dir'] = tmp_dir

        component_impl_map = {}
        for tfx_component in tfx_pipeline.components:

            tfx_component = self._replace_runtime_params(tfx_component)

            (component_launcher_class,
             component_config) = config_utils.find_component_launch_info(
                 self._config, tfx_component)
            current_airflow_component = airflow_component.AirflowComponent(
                airflow_dag,
                component=tfx_component,
                component_launcher_class=component_launcher_class,
                pipeline_info=tfx_pipeline.pipeline_info,
                enable_cache=tfx_pipeline.enable_cache,
                metadata_connection_config=(
                    tfx_pipeline.metadata_connection_config),
                beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
                additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
                component_config=component_config)
            component_impl_map[tfx_component] = current_airflow_component
            for upstream_node in tfx_component.upstream_nodes:
                assert upstream_node in component_impl_map, (
                    'Components are not in '
                    'topological order')
                current_airflow_component.set_upstream(
                    component_impl_map[upstream_node])

        return airflow_dag
Example #10
    def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline,
                                  pipeline_root: dsl.PipelineParam):
        """Constructs a Kubeflow Pipeline graph.

    Args:
      pipeline: The logical TFX pipeline to base the construction on.
      pipeline_root: dsl.PipelineParam representing the pipeline root.
    """
        component_to_kfp_op = {}

        # Assumption: There is a partial ordering of components in the list, i.e.,
        # if component A depends on component B and C, then A appears after B and C
        # in the list.
        for component in pipeline.components:
            # Keep track of the set of upstream dsl.ContainerOps for this component.
            depends_on = set()

            for upstream_component in component.upstream_nodes:
                depends_on.add(component_to_kfp_op[upstream_component])

            (component_launcher_class,
             component_config) = config_utils.find_component_launch_info(
                 self._config, component)

            kfp_component = base_component.BaseComponent(
                component=component,
                component_launcher_class=component_launcher_class,
                depends_on=depends_on,
                pipeline=pipeline,
                pipeline_name=pipeline.pipeline_info.pipeline_name,
                pipeline_root=pipeline_root,
                tfx_image=self._config.tfx_image,
                kubeflow_metadata_config=self._config.kubeflow_metadata_config,
                component_config=component_config)

            for operator in self._config.pipeline_operator_funcs:
                kfp_component.container_op.apply(operator)

            kfp_component.container_op.add_pod_label(SDK_ENV_LABEL,
                                                     self._sdk_env)
            assert self._pipeline_id, 'Failed to generate pipeline ID.'
            kfp_component.container_op.add_pod_label(PIPELINE_UUID_LABEL,
                                                     self._pipeline_id)

            component_to_kfp_op[component] = kfp_component.container_op
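
The depends_on set collected above ultimately expresses the same ordering that the KFP v1 DSL exposes as ContainerOp.after(). A minimal sketch, assuming the KFP v1 SDK (op names and images are illustrative):

import kfp.dsl as dsl

@dsl.pipeline(name='two-step')
def two_step():
    op_a = dsl.ContainerOp(name='a', image='alpine', command=['true'])
    op_b = dsl.ContainerOp(name='b', image='alpine', command=['true'])
    # Equivalent of adding op_a's op to op_b's depends_on set above.
    op_b.after(op_a)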
Example #11
  def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
    """Runs given logical pipeline locally.

    Args:
      tfx_pipeline: Logical pipeline containing pipeline args and components.
    """
    # For CLI, while creating or updating pipeline, pipeline_args are extracted
    # and hence we avoid executing the pipeline.
    if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
      return

    tfx_pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat()

    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
      # Run each component. Note that the pipeline.components list is in
      # topological order.
      #
      # TODO(b/171319478): After IR-based execution is used, use multi-threaded
      # execution so that independent components can be run in parallel.
      for component in tfx_pipeline.components:
        # TODO(b/187122662): Pass through pip dependencies as a first-class
        # component flag.
        if isinstance(component, base_component.BaseComponent):
          component._resolve_pip_dependencies(  # pylint: disable=protected-access
              tfx_pipeline.pipeline_info.pipeline_root)
        (component_launcher_class, component_config) = (
            config_utils.find_component_launch_info(self._config, component))
        driver_args = data_types.DriverArgs(
            enable_cache=tfx_pipeline.enable_cache)
        metadata_connection = metadata.Metadata(
            tfx_pipeline.metadata_connection_config)
        node_launcher = component_launcher_class.create(
            component=component,
            pipeline_info=tfx_pipeline.pipeline_info,
            driver_args=driver_args,
            metadata_connection=metadata_connection,
            beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
            additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
            component_config=component_config)
        logging.info('Component %s is running.', component.id)
        node_launcher.launch()
        logging.info('Component %s is finished.', component.id)
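Example #12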
    def run(self, tfx_pipeline: pipeline.Pipeline) -> None:
        for component in tfx_pipeline.components:
            (component_launcher_class,
             component_config) = (config_utils.find_component_launch_info(
                 self._config, component))
            driver_args = data_types.DriverArgs(
                enable_cache=tfx_pipeline.enable_cache)
            metadata_connection = metadata.Metadata(
                tfx_pipeline.metadata_connection_config)
            component_launcher = component_launcher_class.create(
                component=component,
                pipeline_info=tfx_pipeline.pipeline_info,
                driver_args=driver_args,
                metadata_connection=metadata_connection,
                beam_pipeline_args=tfx_pipeline.beam_pipeline_args,
                additional_pipeline_args=tfx_pipeline.additional_pipeline_args,
                component_config=component_config)
            logger.info('Component %s is running.', component.id)
            component_launcher.launch()
            logger.info('Component %s is finished.', component.id)
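
For comparison, a hypothetical entry point for running pipelines with runners like the two above via TFX's local orchestrator; create_pipeline() is assumed to build the logical pipeline:

# Hypothetical local-run entry point.
from tfx.orchestration.local.local_dag_runner import LocalDagRunner

if __name__ == '__main__':
    LocalDagRunner().run(create_pipeline())  # create_pipeline() is assumed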