Example #1
  def run(self, pipeline: pipeline_py.Pipeline) -> None:
    """Runs given logical pipeline locally.

    Args:
      pipeline: Logical pipeline containing pipeline args and components.
    """
    # When the CLI is creating or updating a pipeline, it only extracts
    # pipeline_args, so we skip executing the pipeline.
    if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
      return

    for component in pipeline.components:
      # TODO(b/187122662): Pass through pip dependencies as a first-class
      # component flag.
      if isinstance(component, base_component.BaseComponent):
        component._resolve_pip_dependencies(  # pylint: disable=protected-access
            pipeline.pipeline_info.pipeline_root)

    c = compiler.Compiler()
    pipeline = c.compile(pipeline)

    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline, {
            constants.PIPELINE_RUN_ID_PARAMETER_NAME:
                datetime.datetime.now().isoformat(),
        })

    deployment_config = runner_utils.extract_local_deployment_config(pipeline)
    connection_config = deployment_config.metadata_connection_config

    logging.info('Running pipeline:\n %s', pipeline)
    logging.info('Using deployment config:\n %s', deployment_config)
    logging.info('Using connection config:\n %s', connection_config)

    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
      # Run each component. Note that the pipeline.nodes list is in
      # topological order.
      #
      # TODO(b/171319478): After IR-based execution is used, use multi-threaded
      # execution so that independent components can be run in parallel.
      for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id)

        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata.Metadata(connection_config),
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec)
        logging.info('Component %s is running.', node_id)
        component_launcher.launch()
        logging.info('Component %s is finished.', node_id)
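For context, a minimal sketch of how a runner with this run method is typically invoked; it assumes the standard tfx.orchestration.local.local_dag_runner module, and _create_pipeline is a hypothetical helper that builds a DSL Pipeline:

from tfx.orchestration.local import local_dag_runner

# `_create_pipeline` is a hypothetical helper returning a
# tfx.orchestration.pipeline.Pipeline; run() compiles it and then executes
# each node in topological order, as shown above.
local_dag_runner.LocalDagRunner().run(_create_pipeline())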
Example #2
def create_pipeline() -> pipeline_pb2.Pipeline:
    """Creates an async pipeline for testing."""
    # pylint: disable=no-value-for-parameter
    example_gen = _example_gen().with_id('my_example_gen')
    transform = _transform(examples=example_gen.outputs['examples'],
                           a_param=10).with_id('my_transform')
    trainer = _trainer(
        examples=example_gen.outputs['examples'],
        transform_graph=transform.outputs['transform_graph']).with_id(
            'my_trainer')
    # pylint: enable=no-value-for-parameter

    pipeline = pipeline_lib.Pipeline(
        pipeline_name='my_pipeline',
        pipeline_root='/path/to/root',
        components=[
            example_gen,
            transform,
            trainer,
        ],
        execution_mode=pipeline_lib.ExecutionMode.ASYNC)
    dsl_compiler = compiler.Compiler()
    compiled_pipeline: pipeline_pb2.Pipeline = dsl_compiler.compile(pipeline)

    # Compiler does not support setting min_count yet, so we mutate the proto
    # explicitly for testing.
    trainer = compiled_pipeline.nodes[2].pipeline_node
    assert trainer.node_info.id == 'my_trainer'
    for value in trainer.inputs.inputs.values():
        value.min_count = 1

    return compiled_pipeline
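A quick follow-up sketch (assuming create_pipeline above is importable) to inspect the node ordering the compiler produced:

compiled = create_pipeline()
# The compiled IR keeps nodes in topological order:
# ['my_example_gen', 'my_transform', 'my_trainer']
print([n.pipeline_node.node_info.id for n in compiled.nodes])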
Example #3
 def setUp(self):
     super().setUp()
     temp_dir = self.get_temp_dir()
     self.pipeline_root = os.path.join(temp_dir, 'pipeline')
     self.metadata_conn_config = metadata.sqlite_metadata_connection_config(
         os.path.join(temp_dir, 'metadata', 'metadata.db'))
     self.compiler = compiler.Compiler()
Example #4
 def testCompile(self, pipeline_module, expected_result_path):
     """Tests compiling the whole pipeline."""
     dsl_compiler = compiler.Compiler()
     compiled_pb = dsl_compiler.compile(
         self._get_test_pipeline_definition(pipeline_module))
     expected_pb = self._get_test_pipeline_pb(expected_result_path)
     self.assertProtoEquals(expected_pb, compiled_pb)
Example #5
 def testCompileImporterAdditionalPropertyTypeError(self):
     dsl_compiler = compiler.Compiler()
     test_pipeline = self._get_test_pipeline_definition(iris_pipeline_async)
     impt = next(c for c in test_pipeline.components
                 if compiler_utils.is_importer(c))
     impt.exec_properties[importer.PROPERTIES_KEY]["split_names"] = 2.1
     with self.assertRaisesRegex(TypeError,
                                 "Expected STRING but given DOUBLE"):
         dsl_compiler.compile(test_pipeline)
Example #6
def create_pipeline() -> pipeline_pb2.Pipeline:
    """Builds a test pipeline with only manual node."""
    manual = manual_node.ManualNode(description='Do something.')

    pipeline = pipeline_lib.Pipeline(pipeline_name='my_pipeline',
                                     pipeline_root='/path/to/root',
                                     components=[manual],
                                     enable_cache=True)
    dsl_compiler = compiler.Compiler()
    return dsl_compiler.compile(pipeline)
Example #7
 def testCompileAdditionalPropertyTypeError(self):
   dsl_compiler = compiler.Compiler()
   test_pipeline = self._get_test_pipeline_definition(
       additional_properties_test_pipeline_async)
   custom_producer = next(
       c for c in test_pipeline.components if isinstance(
           c, additional_properties_test_pipeline_async.CustomProducer))
   custom_producer.outputs["stats"].additional_properties[
       "span"] = "wrong_type"
   with self.assertRaisesRegex(TypeError, "Expected INT but given STRING"):
     dsl_compiler.compile(test_pipeline)
Example #8
def create_pipeline() -> pipeline_pb2.Pipeline:
  """Builds a test pipeline."""
  # pylint: disable=no-value-for-parameter
  example_gen = _example_gen().with_id('my_example_gen')
  stats_gen = _statistics_gen(
      examples=example_gen.outputs['examples']).with_id('my_statistics_gen')
  schema_gen = _schema_gen(
      statistics=stats_gen.outputs['statistics']).with_id('my_schema_gen')
  example_validator = _example_validator(
      statistics=stats_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema']).with_id('my_example_validator')
  transform = _transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema']).with_id('my_transform')
  trainer = _trainer(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph']).with_id(
          'my_trainer')

  # Nodes with no input or output specs, for testing task-only dependencies.
  chore_a = _chore().with_id('chore_a')
  chore_a.add_upstream_node(trainer)
  chore_b = _chore().with_id('chore_b')
  chore_b.add_upstream_node(chore_a)

  with conditional.Cond(
      trainer.outputs['model'].future()[0].custom_property('evaluate') == 1):
    evaluator = _evaluator(
        model=trainer.outputs['model']).with_id('my_evaluator')
  # pylint: enable=no-value-for-parameter

  pipeline = pipeline_lib.Pipeline(
      pipeline_name='my_pipeline',
      pipeline_root='/path/to/root',
      components=[
          example_gen,
          stats_gen,
          schema_gen,
          example_validator,
          transform,
          trainer,
          evaluator,
          chore_a,
          chore_b,
      ],
      enable_cache=True)
  dsl_compiler = compiler.Compiler()
  return dsl_compiler.compile(pipeline)
Example #9
def _make_pipeline_proto(
        pipeline: pipeline_py.Pipeline) -> pipeline_pb2.Pipeline:
    """Resolve pip dependencies and compile Pipeline object."""
    if isinstance(pipeline, pipeline_pb2.Pipeline):
        raise ValueError(
            'The "run" method, which is only meant for running Pipeline objects, '
            'was called with a Pipeline IR. Did you mean to call the '
            '"run_with_ir" method instead?')
    for component in pipeline.components:
        # TODO(b/187122662): Pass through pip dependencies as a first-class
        # component flag.
        if isinstance(component, base_component.BaseComponent):
            component._resolve_pip_dependencies(  # pylint: disable=protected-access
                pipeline.pipeline_info.pipeline_root)
    return compiler.Compiler().compile(pipeline)
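A sketch of the run/run_with_ir split this helper enforces; `runner` and `dsl_pipeline` are placeholder names, not part of the original code:

# Hand the DSL Pipeline to run(), which compiles it internally via
# _make_pipeline_proto()...
runner.run(dsl_pipeline)
# ...or compile it yourself and pass the resulting proto to run_with_ir().
pipeline_ir = compiler.Compiler().compile(dsl_pipeline)
runner.run_with_ir(pipeline_ir)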
Example #10
  def testPatcher(self, use_pipeline_proto, mock_run):
    patcher = _DummyDagRunnerPatcher(self)
    pipeline = tfx_pipeline.Pipeline(_PIPELINE_NAME, 'dummy_root')
    if use_pipeline_proto:
      pipeline = compiler.Compiler().compile(pipeline)
    runner = _DummyDagRunner()

    with patcher.patch() as context:
      self.assertNotIn('foo', context)
      self.assertFalse(patcher.run_called)
      runner.run(pipeline)
      print(context)
      self.assertEqual(context['foo'], 24)
      self.assertTrue(patcher.run_called)
      mock_run.assert_called_once()
Example #11
 def testCompileDynamicExecPropTypeError(self):
   dsl_compiler = compiler.Compiler()
   test_pipeline = self._get_test_pipeline_definition(
       dynamic_exec_properties_pipeline)
   downstream_component = next(
       c for c in test_pipeline.components
       if isinstance(c, dynamic_exec_properties_pipeline.DownstreamComponent))
   instance_a = _MyType()
   instance_b = _MyType()
   test_wrong_type_channel = channel.Channel(_MyType).set_artifacts(
       [instance_a, instance_b]).future()
   downstream_component.exec_properties["input_num"] = test_wrong_type_channel
   with self.assertRaisesRegex(
       ValueError,
       "output channel to dynamic exec properties is not ValueArtifact"):
     dsl_compiler.compile(test_pipeline)
Example #12
def create_pipeline() -> pipeline_pb2.Pipeline:
    """Creates a pipeline with an importer node for testing."""
    inode = importer.Importer(
        source_uri='my_url',
        reimport=True,
        custom_properties={
            'int_custom_property': 123,
            'str_custom_property': 'abc',
        },
        artifact_type=standard_artifacts.Schema).with_id('my_importer')
    pipeline = pipeline_lib.Pipeline(
        pipeline_name='my_pipeline',
        pipeline_root='/path/to/root',
        components=[inode],
        execution_mode=pipeline_lib.ExecutionMode.SYNC)
    dsl_compiler = compiler.Compiler()
    return dsl_compiler.compile(pipeline)
Example #13
  def __init__(
      self,
      output_dir: Optional[Text] = None,
      output_filename: Optional[Text] = None,
      config: Optional[KubeflowDagRunnerConfig] = None,
      pod_labels_to_attach: Optional[Dict[Text, Text]] = None
  ):
    """Initializes KubeflowDagRunner for compiling a Kubeflow Pipeline.

    Args:
      output_dir: An optional directory into which to write the pipeline
        definition files. Defaults to the current working directory.
      output_filename: An optional output file name for the pipeline definition
        file. Defaults to pipeline_name.tar.gz when compiling a TFX pipeline.
        Currently supports .tar.gz, .tgz, .zip, .yaml, .yml formats. See
        https://github.com/kubeflow/pipelines/blob/181de66cf9fa87bcd0fe9291926790c400140783/sdk/python/kfp/compiler/compiler.py#L851
          for format restriction.
      config: An optional KubeflowDagRunnerConfig object to specify runtime
        configuration when running the pipeline under Kubeflow.
      pod_labels_to_attach: Optional set of pod labels to attach to GKE pods
        spun up for this pipeline. Defaults to the following 3 labels:
        1. add-pod-env: true,
        2. pipeline SDK type,
        3. pipeline unique ID,
        where 2 and 3 are instrumentation for usage tracking.
    """
    if config and not isinstance(config, KubeflowDagRunnerConfig):
      raise TypeError('config must be of type KubeflowDagRunnerConfig.')
    super(KubeflowDagRunner, self).__init__(config or KubeflowDagRunnerConfig())
    self._config = cast(KubeflowDagRunnerConfig, self._config)
    self._output_dir = output_dir or os.getcwd()
    self._output_filename = output_filename
    self._compiler = compiler.Compiler()
    self._tfx_compiler = tfx_compiler.Compiler()
    self._params = []  # List of dsl.PipelineParam used in this pipeline.
    self._deduped_parameter_names = set()  # Set of unique param names used.
    if pod_labels_to_attach is None:
      self._pod_labels_to_attach = get_default_pod_labels()
    else:
      self._pod_labels_to_attach = pod_labels_to_attach
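A hedged instantiation sketch, assuming the usual tfx.orchestration.kubeflow.kubeflow_dag_runner module layout; `dsl_pipeline` is a placeholder Pipeline object:

from tfx.orchestration.kubeflow import kubeflow_dag_runner

# Writes my_pipeline.yaml under /tmp/pipelines; config and pod labels fall
# back to the defaults described in the docstring above.
runner = kubeflow_dag_runner.KubeflowDagRunner(
    output_dir='/tmp/pipelines',
    output_filename='my_pipeline.yaml')
runner.run(dsl_pipeline)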
Example #14
def create_pipeline() -> pipeline_pb2.Pipeline:
  """Creates a pipeline with a resolver node for testing."""
  trainer = _trainer().with_id('my_trainer')  # pylint: disable=no-value-for-parameter
  rnode = resolver.Resolver(
      strategy_class=latest_artifact_strategy.LatestArtifactStrategy,
      config={
          'desired_num_of_artifacts': 1
      },
      resolved_model=types.Channel(
          type=standard_artifacts.Model)).with_id('my_resolver')
  rnode.add_upstream_node(trainer)
  consumer = _consumer(
      resolved_model=rnode.outputs['resolved_model']).with_id('my_consumer')
  pipeline = pipeline_lib.Pipeline(
      pipeline_name='my_pipeline',
      pipeline_root='/path/to/root',
      components=[
          trainer,
          rnode,
          consumer,
      ],
      execution_mode=pipeline_lib.ExecutionMode.SYNC)
  dsl_compiler = compiler.Compiler()
  return dsl_compiler.compile(pipeline)
Example #15
  def run(self, pipeline: Union[pipeline_pb2.Pipeline,
                                pipeline_py.Pipeline]) -> None:
    """Deploys given logical pipeline on Beam.

    Args:
      pipeline: Logical pipeline object, or its compiled proto in IR format.
    """
    # When the CLI is creating or updating a pipeline, it only extracts
    # pipeline_args, so we skip deploying the pipeline.
    if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ:
      return

    if isinstance(pipeline, pipeline_py.Pipeline):
      c = compiler.Compiler()
      pipeline = c.compile(pipeline)

    run_id = datetime.datetime.now().strftime('%Y%m%d-%H%M%S.%f')
    # Substitute the runtime parameter with a concrete run_id.
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline, {
            constants.PIPELINE_RUN_ID_PARAMETER_NAME: run_id,
        })

    deployment_config = self._extract_deployment_config(pipeline)
    connection_config = self._connection_config_from_deployment_config(
        deployment_config)

    logging.info('Running pipeline:\n %s', pipeline)
    logging.info('Using deployment config:\n %s', deployment_config)
    logging.info('Using connection config:\n %s', connection_config)

    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}):
      with beam.Pipeline() as p:
        # Used for triggering the node DoFns.
        root = p | 'CreateRoot' >> beam.Create([None])

        # Stores mapping of node to its signal.
        signal_map = {}
        # pipeline.nodes are in topological order.
        for node in pipeline.nodes:
          # TODO(b/160882349): Support subpipeline
          pipeline_node = node.pipeline_node
          node_id = pipeline_node.node_info.id
          executor_spec = self._extract_executor_spec(
              deployment_config, node_id)
          custom_driver_spec = self._extract_custom_driver_spec(
              deployment_config, node_id)

          # Signals from upstream nodes.
          signals_to_wait = []
          for upstream_node in pipeline_node.upstream_nodes:
            assert upstream_node in signal_map, ('Nodes are not in '
                                                 'topological order')
            signals_to_wait.append(signal_map[upstream_node])
          logging.info('Node %s depends on %s.', node_id,
                       [s.producer.full_label for s in signals_to_wait])

          # Each signal is an empty PCollection. AsIter ensures a node will
          # be triggered after upstream nodes are finished.
          signal_map[node_id] = (
              root
              | 'Run[%s]' % node_id >> beam.ParDo(
                  self._PIPELINE_NODE_DO_FN_CLS(
                      pipeline_node=pipeline_node,
                      mlmd_connection_config=connection_config,
                      pipeline_info=pipeline.pipeline_info,
                      pipeline_runtime_spec=pipeline.runtime_spec,
                      executor_spec=executor_spec,
                      custom_driver_spec=custom_driver_spec,
                      deployment_config=deployment_config),
                  *[beam.pvalue.AsIter(s) for s in signals_to_wait]))
          logging.info('Node %s is scheduled.', node_id)
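The scheduling trick above can be isolated into a standalone sketch: an empty PCollection consumed through beam.pvalue.AsIter forces one DoFn to wait for another to finish. Everything here except the Beam API itself is illustrative:

import apache_beam as beam


class _Step(beam.DoFn):
  """Toy stand-in for _PIPELINE_NODE_DO_FN_CLS; emits nothing."""

  def __init__(self, name):
    self._name = name

  def process(self, element, *signals):
    print('running', self._name)
    return []  # the empty output PCollection acts as the "done" signal


with beam.Pipeline() as p:
  root = p | 'CreateRoot' >> beam.Create([None])
  first = root | 'Run[first]' >> beam.ParDo(_Step('first'))
  # 'second' fires only after 'first' finishes, because its side input
  # (AsIter of first's output) must be fully computed before processing.
  _ = root | 'Run[second]' >> beam.ParDo(
      _Step('second'), beam.pvalue.AsIter(first))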
Example #16
 def testCompile(self):
   """Test compiling the whole pipeline."""
   c = compiler.Compiler()
   compiled_pb = c.compile(self._pipeline)
   self.assertProtoEquals(self._pipeline_pb, compiled_pb)
Example #17
def run(benchmarks: List[Benchmark],
        tfx_runner: Optional[tfx_runner_lib.TfxRunner] = None,
        pipeline_name: Optional[str] = None,
        pipeline_root: Optional[str] = None,
        metadata_connection_config: Optional[
            metadata_store_pb2.ConnectionConfig] = None,
        enable_cache: Optional[bool] = False,
        beam_pipeline_args: Optional[List[str]] = None,
        **kwargs) -> BenchmarkPipeline:
    """Runs the given benchmarks as part of a single pipeline DAG.

  First it concatenates all the benchmark pipelines into a single DAG
  benchmark pipeline. Next it executes the workflow via tfx_runner.run().

  When the `match` flag is set, matched benchmarks are filtered by name.

  When the `runs_per_benchmark` flag is set, each benchmark is run the number
  of times specified.


  Args:
    benchmarks: List of Benchmark instances to include in the suite.
    tfx_runner: The TfxRunner instance that defines the platform where
      benchmarks are run.
    pipeline_name: Name of the benchmark pipeline.
    pipeline_root: Path to root directory of the pipeline.
    metadata_connection_config: The config to connect to ML metadata.
    enable_cache: Whether or not cache is enabled for this run.
    beam_pipeline_args: Beam pipeline args for beam jobs within executor.
      Executor will use beam DirectRunner as Default.
    **kwargs: Additional kwargs forwarded as kwargs to benchmarks.

  Returns:
    Returns the BenchmarkPipeline that was passed to the tfx_runner.

  Raises:
    ValueError: If the given tfx_runner is not supported.
  """

    if "compile_pipeline" in kwargs:
        kwargs.pop("compile_pipeline")
        logging.warning(
            "The `compile_pipeline` argument is DEPRECATED and ignored. "
            "Pipelines are now automatically compiled.")

    runs_per_benchmark = FLAGS.runs_per_benchmark
    if runs_per_benchmark is None:
        runs_per_benchmark = int(
            os.environ.get("NITROML_RUNS_PER_BENCHMARK", 1))

    if not tfx_runner:
        logging.info("Setting TFX runner to OSS default: BeamDagRunner.")
        tfx_runner = beam_dag_runner.BeamDagRunner()

    if runs_per_benchmark <= 0:
        raise ValueError(
            "runs_per_benchmark must be strictly positive; "
            f"got runs_per_benchmark={runs_per_benchmark} instead.")

    benchmark_subpipelines = []
    for b in benchmarks:
        for benchmark_run in range(runs_per_benchmark):
            # Call benchmarks with pipeline args.
            spec = b(benchmark_run=benchmark_run + 1,
                     runs_per_benchmark=runs_per_benchmark,
                     **kwargs)
            for benchmark_subpipeline in spec.benchmark_subpipelines:
                if re.match(FLAGS.match, benchmark_subpipeline.id):
                    benchmark_subpipelines.append(benchmark_subpipeline)

    if FLAGS.match and not benchmark_subpipelines:
        if spec.components_to_always_add:
            logging.info(
                "No benchmarks matched the pattern '%s'. "
                "Running components passed to self.add(..., always=True) only.",
                FLAGS.match)
        else:
            raise ValueError(
                f"No benchmarks matched the pattern '{FLAGS.match}'")

    benchmark_pipeline = BenchmarkPipeline(
        components_to_always_add=spec.components_to_always_add,
        benchmark_subpipelines=benchmark_subpipelines,
        pipeline_name=pipeline_name,
        pipeline_root=pipeline_root,
        metadata_connection_config=metadata_connection_config,
        enable_cache=enable_cache,
        beam_pipeline_args=beam_pipeline_args,
        **kwargs)

    logging.info("NitroML benchmarks:")
    for benchmark_name in benchmark_pipeline.benchmark_names:
        logging.info("\t%s", benchmark_name)
        logging.info("\t\tRUNNING")
    dsl_compiler = compiler.Compiler()
    pipeline_to_run = dsl_compiler.compile(benchmark_pipeline)
    if spec.requested_partial_run:
        logging.info("Only running the following nodes:\n%s",
                     "\n".join(spec.nodes_to_partial_run))
        pipeline_to_run = pipeline_filtering.filter_pipeline(
            input_pipeline=pipeline_to_run,
            pipeline_run_id_fn=(
                pipeline_filtering.make_latest_resolver_pipeline_run_id_fn(
                    benchmark_pipeline.metadata_connection_config)),
            skip_nodes=lambda x: x not in set(spec.nodes_to_partial_run))

    tfx_runner.run(pipeline_to_run)
    return benchmark_pipeline
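A hypothetical invocation sketch; MyBenchmark is a placeholder Benchmark subclass, not part of the original code:

benchmark_pipeline = run(
    benchmarks=[MyBenchmark()],
    pipeline_name='nitroml_benchmarks',
    pipeline_root='/tmp/nitroml',
    enable_cache=False)
for name in benchmark_pipeline.benchmark_names:
    print(name)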
Example #18
    def run(
        self, pipeline: tfx_pipeline.Pipeline, run_name: Optional[str] = None
    ) -> None:
        """Runs given logical pipeline locally.

        Args:
          pipeline: Logical pipeline containing pipeline args and components.
          run_name: Optional name for the run.
        """
        for component in pipeline.components:
            if isinstance(component, base_component.BaseComponent):
                component._resolve_pip_dependencies(
                    pipeline.pipeline_info.pipeline_root
                )

        c = compiler.Compiler()
        pipeline = c.compile(pipeline)

        run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
        # Substitute the runtime parameter with a concrete run_id.
        runtime_parameter_utils.substitute_runtime_parameter(
            pipeline,
            {
                PIPELINE_RUN_ID_PARAMETER_NAME: run_name,
            },
        )

        deployment_config = runner_utils.extract_local_deployment_config(
            pipeline
        )
        connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

        logger.debug(f"Using deployment config:\n {deployment_config}")
        logger.debug(f"Using connection config:\n {connection_config}")

        # Run each component. Note that the pipeline.nodes list is in
        # topological order.
        for node in pipeline.nodes:
            pipeline_node = node.pipeline_node
            node_id = pipeline_node.node_info.id
            executor_spec = runner_utils.extract_executor_spec(
                deployment_config, node_id
            )
            custom_driver_spec = runner_utils.extract_custom_driver_spec(
                deployment_config, node_id
            )

            component_launcher = launcher.Launcher(
                pipeline_node=pipeline_node,
                mlmd_connection=metadata.Metadata(connection_config),
                pipeline_info=pipeline.pipeline_info,
                pipeline_runtime_spec=pipeline.runtime_spec,
                executor_spec=executor_spec,
                custom_driver_spec=custom_driver_spec,
            )
            start = time.time()
            logger.info(f"Step `{node_id}` has started.")
            component_launcher.launch()
            end = time.time()
            logger.info(
                f"Step `{node_id}` has finished"
                f" in {format_timedelta_pretty(end - start)}."
            )
Example #19
    def run(self,
            pipeline: tfx_pipeline.Pipeline,
            run_name: Optional[str] = None) -> "airflow.DAG":
        """Deploys given logical pipeline on Airflow.

        Args:
          pipeline: Logical pipeline containing pipeline args and components.
          run_name: Optional name for the run.

        Returns:
          An Airflow DAG.
        """
        # Only import these when needed.
        import airflow  # noqa

        from zenml.integrations.airflow.orchestrators import airflow_component

        # Merge airflow-specific configs with pipeline args

        airflow_dag = airflow.DAG(
            dag_id=pipeline.pipeline_info.pipeline_name,
            **(typing.cast(AirflowPipelineConfig,
                           self._config).airflow_dag_config),
            is_paused_upon_creation=False,
            catchup=False,  # no backfill
        )
        if "tmp_dir" not in pipeline.additional_pipeline_args:
            tmp_dir = os.path.join(pipeline.pipeline_info.pipeline_root,
                                   ".temp", "")
            pipeline.additional_pipeline_args["tmp_dir"] = tmp_dir

        for component in pipeline.components:
            if isinstance(component, base_component.BaseComponent):
                component._resolve_pip_dependencies(
                    pipeline.pipeline_info.pipeline_root)
            self._replace_runtime_params(component)

        c = compiler.Compiler()
        pipeline = c.compile(pipeline)

        run_name = run_name or datetime.now().strftime("%d_%h_%y-%H_%M_%S_%f")
        # Substitute the runtime parameter with a concrete run_id.
        runtime_parameter_utils.substitute_runtime_parameter(
            pipeline,
            {
                "pipeline-run-id": run_name,
            },
        )
        deployment_config = runner_utils.extract_local_deployment_config(
            pipeline)
        connection_config = deployment_config.metadata_connection_config  # type: ignore[attr-defined] # noqa

        component_impl_map = {}

        for node in pipeline.nodes:
            pipeline_node = node.pipeline_node
            node_id = pipeline_node.node_info.id
            executor_spec = runner_utils.extract_executor_spec(
                deployment_config, node_id)
            custom_driver_spec = runner_utils.extract_custom_driver_spec(
                deployment_config, node_id)

            current_airflow_component = airflow_component.AirflowComponent(
                parent_dag=airflow_dag,
                pipeline_node=pipeline_node,
                mlmd_connection=connection_config,
                pipeline_info=pipeline.pipeline_info,
                pipeline_runtime_spec=pipeline.runtime_spec,
                executor_spec=executor_spec,
                custom_driver_spec=custom_driver_spec,
            )
            component_impl_map[node_id] = current_airflow_component
            for upstream_node in node.pipeline_node.upstream_nodes:
                assert (upstream_node in component_impl_map
                        ), "Components is not in topological order"
                current_airflow_component.set_upstream(
                    component_impl_map[upstream_node])

        return airflow_dag
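The returned DAG is meant to be bound at module scope so the Airflow scheduler can discover it; a sketch under that assumption, where the runner class name and the make_pipeline helper are placeholders:

from datetime import datetime

# Placed in a file under Airflow's dags/ folder. AirflowPipelineConfig is
# referenced by the code above; `MyAirflowDagRunner` and `make_pipeline`
# are hypothetical.
DAG = MyAirflowDagRunner(
    AirflowPipelineConfig(airflow_dag_config={
        'schedule_interval': None,
        'start_date': datetime(2024, 1, 1),
    })).run(make_pipeline())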
Example #20
  def testStubbedImdbPipelineBeam(self):
    pipeline_ir = compiler.Compiler().compile(self.imdb_pipeline)

    pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                             self._recorded_output_dir, [])

    BeamDagRunner().run(pipeline_ir)

    self.assertTrue(fileio.exists(self._metadata_path))

    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)

    # Verify that recorded files are successfully copied to the output uris.
    with metadata.Metadata(metadata_config) as m:
      for execution in m.store.get_executions():
        component_id = pipeline_recorder_utils.get_component_id_from_execution(
            m, execution)
        if component_id.startswith('ResolverNode'):
          continue
        eid = [execution.id]
        events = m.store.get_events_by_execution_ids(eid)
        output_events = [
            x for x in events if x.type == metadata_store_pb2.Event.OUTPUT
        ]
        for event in output_events:
          steps = event.path.steps
          assert steps[0].HasField('key')
          name = steps[0].key
          artifacts = m.store.get_artifacts_by_id([event.artifact_id])
          for idx, artifact in enumerate(artifacts):
            self.assertDirectoryEqual(
                artifact.uri,
                os.path.join(self._recorded_output_dir, component_id, name,
                             str(idx)))

    # Calls verifier for pipeline output artifacts, excluding the resolver node.
    BeamDagRunner().run(self.imdb_pipeline)
    pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
        self.imdb_pipeline.metadata_connection_config,
        self._pipeline_name)

    verifier_map = {
        'model': self._verify_model,
        'model_run': self._verify_model,
        'examples': self._verify_examples,
        'schema': self._verify_schema,
        'anomalies': self._verify_anomalies,
        'evaluation': self._verify_evaluation,
        # A subdirectory of updated_analyzer_cache has a changing name.
        'updated_analyzer_cache': self._veryify_root_dir,
    }

    # List of components to verify. ResolverNode is ignored because it
    # doesn't have an executor.
    verify_component_ids = [
        component.id
        for component in self.imdb_pipeline.components
        if not component.id.startswith('ResolverNode')
    ]

    for component_id in verify_component_ids:
      for key, artifact_dict in pipeline_outputs[component_id].items():
        for idx, artifact in artifact_dict.items():
          logging.info('Verifying %s', component_id)
          recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                      key, str(idx))
          verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                        recorded_uri)
Example #21
 def _getTestPipelineIR(self) -> pipeline_pb2.Pipeline:
   test_pipeline = self._getTestPipeline()
   c = compiler.Compiler()
   return c.compile(test_pipeline)
Example #22
    def testStubbedTaxiPipelineBeam(self):
        pipeline_ir = compiler.Compiler().compile(self.taxi_pipeline)

        logging.info('Replacing with test_data_dir:%s',
                     self._recorded_output_dir)
        pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                                 self._recorded_output_dir, [])

        BeamDagRunner().run_with_ir(pipeline_ir)

        self.assertTrue(fileio.exists(self._metadata_path))

        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(metadata_config) as m:
            artifacts = m.store.get_artifacts()
            artifact_count = len(artifacts)
            executions = m.store.get_executions()
            execution_count = len(executions)
            # Artifact count is greater by 7 due to the extra artifacts produced
            # by Evaluator (blessing and evaluation), Trainer (model and
            # model_run) and Transform (example, graph, cache,
            # pre_transform_statistics, pre_transform_schema,
            # post_transform_statistics, post_transform_schema,
            # post_transform_anomalies), minus Resolver, which doesn't generate
            # a new artifact.
            self.assertEqual(artifact_count, execution_count + 7)
            self.assertLen(self.taxi_pipeline.components, execution_count)

            for execution in executions:
                component_id = pipeline_recorder_utils.get_component_id_from_execution(
                    m, execution)
                if component_id.startswith('Resolver'):
                    continue
                eid = [execution.id]
                events = m.store.get_events_by_execution_ids(eid)
                output_events = [
                    x for x in events
                    if x.type == metadata_store_pb2.Event.OUTPUT
                ]
                for event in output_events:
                    steps = event.path.steps
                    self.assertTrue(steps[0].HasField('key'))
                    name = steps[0].key
                    artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(artifacts):
                        self.assertDirectoryEqual(
                            artifact.uri,
                            os.path.join(self._recorded_output_dir,
                                         component_id, name, str(idx)))

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        BeamDagRunner().run(self.taxi_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.taxi_pipeline.metadata_connection_config, self._pipeline_name)

        verifier_map = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation,
            # A subdirectory of updated_analyzer_cache has a changing name.
            'updated_analyzer_cache': self._veryify_root_dir,
        }

        # List of components to verify. Resolver is ignored because it
        # doesn't have an executor.
        verify_component_ids = [
            component.id for component in self.taxi_pipeline.components
            if not component.id.startswith('Resolver')
        ]

        for component_id in verify_component_ids:
            logging.info('Verifying %s', component_id)
            for key, artifact_dict in pipeline_outputs[component_id].items():
                for idx, artifact in artifact_dict.items():
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                                  recorded_uri)
Example #23
 def _getTestPipelineIR(self) -> pipeline_pb2.Pipeline:  # pylint: disable=invalid-name
     test_pipeline = self._getTestPipeline()
     c = compiler.Compiler()
     return c.compile(test_pipeline)