Code Example #1
  def run_pipeline(self, pipeline):
    """Execute the entire pipeline and returns an DirectPipelineResult."""

    # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
    # problems with resolving imports when they are at the top.
    # pylint: disable=wrong-import-position
    from apache_beam.pipeline import PipelineVisitor
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry
    from apache_beam.testing.test_stream import TestStream

    # Performing configured PTransform overrides.
    pipeline.replace_all(_get_transform_overrides(pipeline.options))

    # If the TestStream I/O is used, use a mock test clock.
    class _TestStreamUsageVisitor(PipelineVisitor):
      """Visitor determining whether a Pipeline uses a TestStream."""

      def __init__(self):
        self.uses_test_stream = False

      def visit_transform(self, applied_ptransform):
        if isinstance(applied_ptransform.transform, TestStream):
          self.uses_test_stream = True

    visitor = _TestStreamUsageVisitor()
    pipeline.visit(visitor)
    clock = TestClock() if visitor.uses_test_stream else RealClock()

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(self.consumer_tracking_visitor)

    evaluation_context = EvaluationContext(
        pipeline._options,
        BundleFactory(stacked=pipeline._options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.consumer_tracking_visitor.root_transforms,
        self.consumer_tracking_visitor.value_to_consumers,
        self.consumer_tracking_visitor.step_names,
        self.consumer_tracking_visitor.views,
        clock)

    executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)
    # DirectRunner does not support injecting
    # PipelineOptions values at runtime
    RuntimeValueProvider.set_runtime_options({})
    # Start the executor. This is a non-blocking call; it starts execution in
    # background threads and returns.
    executor.start(self.consumer_tracking_visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    return result
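Note: the runner above calls RuntimeValueProvider.set_runtime_options({}) to enter an execution-time context with no runtime overrides, while the tests further down call set_runtime_options(None) to return to the uninitialized, graph-construction state. Below is a minimal sketch of that lifecycle, assuming a Beam version where RuntimeValueProvider is importable from apache_beam.options.value_provider:

from apache_beam.options.value_provider import RuntimeValueProvider

vp = RuntimeValueProvider(
    option_name='my_opt', value_type=str, default_value='fallback')
assert not vp.is_accessible()      # construction time: no runtime context yet

RuntimeValueProvider.set_runtime_options({})    # execution time, no overrides
assert vp.is_accessible()
assert vp.get() == 'fallback'      # falls back to the declared default

RuntimeValueProvider.set_runtime_options({'my_opt': 'override'})
assert vp.get() == 'override'      # a provided runtime value wins

RuntimeValueProvider.set_runtime_options(None)  # reset, as the tests below do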
Code Example #2
    def _r(runner, options, seeds):
        bigquery.truncate(seeds)
        bigquery.seed(seeds)

        RuntimeValueProvider.set_runtime_options(None)

        runner._run(TestPipeline(options=options), options)
Code Example #3
File: direct_runner.py Project: xsm110/Beam15.0
    def run_pipeline(self, pipeline, options):
        """Execute the entire pipeline and returns an DirectPipelineResult."""

        # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
        # problems with resolving imports when they are at the top.
        # pylint: disable=wrong-import-position
        from apache_beam.pipeline import PipelineVisitor
        from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
          ConsumerTrackingPipelineVisitor
        from apache_beam.runners.direct.evaluation_context import EvaluationContext
        from apache_beam.runners.direct.executor import Executor
        from apache_beam.runners.direct.transform_evaluator import \
          TransformEvaluatorRegistry
        from apache_beam.testing.test_stream import TestStream

        # Performing configured PTransform overrides.
        pipeline.replace_all(_get_transform_overrides(options))

        # If the TestStream I/O is used, use a mock test clock.
        class _TestStreamUsageVisitor(PipelineVisitor):
            """Visitor determining whether a Pipeline uses a TestStream."""
            def __init__(self):
                self.uses_test_stream = False

            def visit_transform(self, applied_ptransform):
                if isinstance(applied_ptransform.transform, TestStream):
                    self.uses_test_stream = True

        visitor = _TestStreamUsageVisitor()
        pipeline.visit(visitor)
        clock = TestClock() if visitor.uses_test_stream else RealClock()

        # TODO(BEAM-4274): Circular import runners-metrics. Requires refactoring.
        from apache_beam.metrics.execution import MetricsEnvironment
        MetricsEnvironment.set_metrics_supported(True)
        logging.info('Running pipeline with DirectRunner.')
        self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
        pipeline.visit(self.consumer_tracking_visitor)

        evaluation_context = EvaluationContext(
            options,
            BundleFactory(stacked=options.view_as(
                DirectOptions).direct_runner_use_stacked_bundle),
            self.consumer_tracking_visitor.root_transforms,
            self.consumer_tracking_visitor.value_to_consumers,
            self.consumer_tracking_visitor.step_names,
            self.consumer_tracking_visitor.views, clock)

        executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                            TransformEvaluatorRegistry(evaluation_context),
                            evaluation_context)
        # DirectRunner does not support injecting
        # PipelineOptions values at runtime
        RuntimeValueProvider.set_runtime_options({})
        # Start the executor. This is a non-blocking call; it starts execution
        # in background threads and returns.
        executor.start(self.consumer_tracking_visitor.root_transforms)
        result = DirectPipelineResult(executor, evaluation_context)

        return result
Code Example #4
def test_inserting_the_dest_table_schema_into_pcollection_runtime():
    with TestPipeline() as p:
        lake_table = RuntimeValueProvider(
            option_name='dest',
            value_type=str,
            default_value=f'{project_id}:lake.wrench_metrics')
        expected = [{
            'schema': [
                gcp_bq.schema.SchemaField('entity_id', 'STRING', 'REQUIRED',
                                          None, ()),
                gcp_bq.schema.SchemaField('tree_user_id', 'INTEGER',
                                          'REQUIRED', None, ()),
                gcp_bq.schema.SchemaField('prediction', 'STRING', 'REQUIRED',
                                          None, ()),
                gcp_bq.schema.SchemaField('client_wrench_id', 'STRING',
                                          'REQUIRED', None, ()),
                gcp_bq.schema.SchemaField('expirement_name', 'STRING',
                                          'NULLABLE', None, ()),
                gcp_bq.schema.SchemaField('processing_datetime', 'DATETIME',
                                          'NULLABLE', None, ()),
                gcp_bq.schema.SchemaField('ingestion_timestamp', 'TIMESTAMP',
                                          'REQUIRED', None, ())
            ],
            'payload': {}
        }]
        pcoll = p | beam.Create([{}])
        schema_pcoll = pcoll | beam.ParDo(
            bq.IngectTableSchema(table=lake_table))
        assert_that(schema_pcoll, equal_to(expected))
        RuntimeValueProvider.set_runtime_options(None)
Code Example #5
    def test_experiments_setup(self):
        self.assertFalse('feature_1' in RuntimeValueProvider.experiments)

        RuntimeValueProvider.set_runtime_options(
            {'experiments': ['feature_1', 'feature_2']})
        self.assertTrue(isinstance(RuntimeValueProvider.experiments, set))
        self.assertTrue('feature_1' in RuntimeValueProvider.experiments)
        self.assertTrue('feature_2' in RuntimeValueProvider.experiments)
Code Example #6
 def test_experiments_setup(self):
     RuntimeValueProvider.set_runtime_options(
         {'experiments': ['feature_1', 'feature_2']})
     self.assertTrue(isinstance(RuntimeValueProvider.experiments, set))
     self.assertTrue('feature_1' in RuntimeValueProvider.experiments)
     self.assertTrue('feature_2' in RuntimeValueProvider.experiments)
     # Clean up runtime_options after this test case finishes; otherwise it
     # will affect other cases, since runtime_options is a static attribute.
     RuntimeValueProvider.set_runtime_options(None)
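Because runtime_options is a class-level attribute, a failing assertion in the test above would skip the trailing reset and leak state into later tests. One way to harden this, a sketch of ours rather than code from the scraped projects, is to register the reset with unittest's addCleanup:

import unittest

from apache_beam.options.value_provider import RuntimeValueProvider

class ExperimentsTest(unittest.TestCase):
  def test_experiments_setup(self):
    # Runs even if an assertion below fails.
    self.addCleanup(RuntimeValueProvider.set_runtime_options, None)
    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['feature_1', 'feature_2']})
    self.assertIn('feature_1', RuntimeValueProvider.experiments)
    self.assertIn('feature_2', RuntimeValueProvider.experiments)

if __name__ == '__main__':
  unittest.main()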
Code Example #7
  def test_get_destination_uri_runtime_vp(self):
    # Provide values at job-execution time.
    RuntimeValueProvider.set_runtime_options({'gcs_location': 'gs://bucket'})
    options = self.UserDefinedOptions()
    unique_id = uuid.uuid4().hex

    uri = bigquery_export_destination_uri(options.gcs_location, None, unique_id)
    self.assertEqual(
        uri, 'gs://bucket/' + unique_id + '/bigquery-table-dump-*.json')
Code Example #8
  def test_get_destination_uri_empty_runtime_vp(self):
    with self.assertRaisesRegex(ValueError,
                                '^ReadFromBigQuery requires a GCS '
                                'location to be provided'):
      # Don't provide any runtime values.
      RuntimeValueProvider.set_runtime_options({})
      options = self.UserDefinedOptions()

      bigquery_export_destination_uri(
          options.gcs_location, None, uuid.uuid4().hex)
Code Example #9
    def test_set_runtime_option(self):
        # define ValueProvider options, with and without default values
        class UserDefinedOptions1(PipelineOptions):
            @classmethod
            def _add_argparse_args(cls, parser):
                parser.add_value_provider_argument(
                    '--vpt_vp_arg6',
                    help='This keyword argument is a value provider'
                )  # set at runtime

                parser.add_value_provider_argument(  # not set, had default int
                    '-v',
                    '--vpt_vp_arg7',  # with short form
                    default=123,
                    type=int)

                parser.add_value_provider_argument(  # not set, had default str
                    '--vpt_vp-arg8',  # with dash in name
                    default='123',
                    type=str)

                parser.add_value_provider_argument(  # not set and no default
                    '--vpt_vp_arg9', type=float)

                parser.add_value_provider_argument(  # positional argument set
                    'vpt_vp_arg10',  # default & runtime ignored
                    help='This positional argument is a value provider',
                    type=float,
                    default=5.4)

        # provide values at graph-construction time
        # (options not provided here become of the type RuntimeValueProvider)
        options = UserDefinedOptions1(['1.2'])
        self.assertFalse(options.vpt_vp_arg6.is_accessible())
        self.assertFalse(options.vpt_vp_arg7.is_accessible())
        self.assertFalse(options.vpt_vp_arg8.is_accessible())
        self.assertFalse(options.vpt_vp_arg9.is_accessible())
        self.assertTrue(options.vpt_vp_arg10.is_accessible())

        # provide values at job-execution time
        # (options not provided here will use their default, if they have one)
        RuntimeValueProvider.set_runtime_options({
            'vpt_vp_arg6': 'abc',
            'vpt_vp_arg10': '3.2'
        })
        self.assertTrue(options.vpt_vp_arg6.is_accessible())
        self.assertEqual(options.vpt_vp_arg6.get(), 'abc')
        self.assertTrue(options.vpt_vp_arg7.is_accessible())
        self.assertEqual(options.vpt_vp_arg7.get(), 123)
        self.assertTrue(options.vpt_vp_arg8.is_accessible())
        self.assertEqual(options.vpt_vp_arg8.get(), '123')
        self.assertTrue(options.vpt_vp_arg9.is_accessible())
        self.assertIsNone(options.vpt_vp_arg9.get())
        self.assertTrue(options.vpt_vp_arg10.is_accessible())
        self.assertEqual(options.vpt_vp_arg10.get(), 1.2)
Code Example #10
File: fn_runner.py Project: dr-manojksingh/beam
    def run_pipeline(
            self,
            pipeline,  # type: Pipeline
            options  # type: pipeline_options.PipelineOptions
    ):
        # type: (...) -> RunnerResult
        RuntimeValueProvider.set_runtime_options({})

        # Setup "beam_fn_api" experiment options if lacked.
        experiments = (options.view_as(
            pipeline_options.DebugOptions).experiments or [])
        if 'beam_fn_api' not in experiments:
            experiments.append('beam_fn_api')
        options.view_as(
            pipeline_options.DebugOptions).experiments = experiments

        # This is sometimes needed if type checking is disabled
        # to enforce that the inputs (and outputs) of GroupByKey operations
        # are known to be KVs.
        from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
        # TODO: Move group_by_key_input_visitor() to a non-dataflow specific file.
        pipeline.visit(
            DataflowRunner.group_by_key_input_visitor(
                not options.view_as(pipeline_options.TypeOptions
                                    ).allow_non_deterministic_key_coders))
        self._bundle_repeat = self._bundle_repeat or options.view_as(
            pipeline_options.DirectOptions).direct_runner_bundle_repeat
        pipeline_direct_num_workers = options.view_as(
            pipeline_options.DirectOptions).direct_num_workers
        if pipeline_direct_num_workers == 0:
            self._num_workers = multiprocessing.cpu_count()
        else:
            self._num_workers = pipeline_direct_num_workers or self._num_workers

        # Set the direct workers' running mode if it is defined in the
        # pipeline options.
        running_mode = options.view_as(
            pipeline_options.DirectOptions).direct_running_mode
        if running_mode == 'multi_threading':
            self._default_environment = environments.EmbeddedPythonGrpcEnvironment(
            )
        elif running_mode == 'multi_processing':
            command_string = '%s -m apache_beam.runners.worker.sdk_worker_main' \
                          % sys.executable
            self._default_environment = environments.SubprocessSDKEnvironment(
                command_string=command_string)

        self._profiler_factory = Profile.factory_from_options(
            options.view_as(pipeline_options.ProfilingOptions))

        self._latest_run_result = self.run_via_runner_api(
            pipeline.to_runner_api(
                default_environment=self._default_environment))
        return self._latest_run_result
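The worker-count and running-mode logic above is driven entirely by DirectOptions flags. A hedged usage sketch follows; the flag names are taken from the code above, but verify they exist in your Beam version:

from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions([
    '--direct_running_mode=multi_processing',   # or 'multi_threading'
    '--direct_num_workers=0',   # 0 means: use multiprocessing.cpu_count()
])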
Code Example #11
File: value_provider_test.py Project: onderson/beam
  def test_experiments_setup(self):
    self.assertFalse('feature_1' in RuntimeValueProvider.experiments)

    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['feature_1', 'feature_2']}
    )
    self.assertTrue(isinstance(RuntimeValueProvider.experiments, set))
    self.assertTrue('feature_1' in RuntimeValueProvider.experiments)
    self.assertTrue('feature_2' in RuntimeValueProvider.experiments)
    # Clean up runtime_options after this test case finishes; otherwise it
    # will affect other cases, since runtime_options is a static attribute.
    RuntimeValueProvider.set_runtime_options(None)
Code Example #12
  def test_set_runtime_option(self):
    # define ValueProvider options, with and without default values
    class UserDefinedOptions1(PipelineOptions):
      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--vpt_vp_arg6',
            help='This keyword argument is a value provider')   # set at runtime

        parser.add_value_provider_argument(         # not set, had default int
            '-v', '--vpt_vp_arg7',                      # with short form
            default=123,
            type=int)

        parser.add_value_provider_argument(         # not set, had default str
            '--vpt_vp-arg8',                            # with dash in name
            default='123',
            type=str)

        parser.add_value_provider_argument(         # not set and no default
            '--vpt_vp_arg9',
            type=float)

        parser.add_value_provider_argument(         # positional argument set
            'vpt_vp_arg10',                         # default & runtime ignored
            help='This positional argument is a value provider',
            type=float,
            default=5.4)

    # provide values at graph-construction time
    # (options not provided here become of the type RuntimeValueProvider)
    options = UserDefinedOptions1(['1.2'])
    self.assertFalse(options.vpt_vp_arg6.is_accessible())
    self.assertFalse(options.vpt_vp_arg7.is_accessible())
    self.assertFalse(options.vpt_vp_arg8.is_accessible())
    self.assertFalse(options.vpt_vp_arg9.is_accessible())
    self.assertTrue(options.vpt_vp_arg10.is_accessible())

    # provide values at job-execution time
    # (options not provided here will use their default, if they have one)
    RuntimeValueProvider.set_runtime_options({'vpt_vp_arg6': 'abc',
                                              'vpt_vp_arg10':'3.2'})
    self.assertTrue(options.vpt_vp_arg6.is_accessible())
    self.assertEqual(options.vpt_vp_arg6.get(), 'abc')
    self.assertTrue(options.vpt_vp_arg7.is_accessible())
    self.assertEqual(options.vpt_vp_arg7.get(), 123)
    self.assertTrue(options.vpt_vp_arg8.is_accessible())
    self.assertEqual(options.vpt_vp_arg8.get(), '123')
    self.assertTrue(options.vpt_vp_arg9.is_accessible())
    self.assertIsNone(options.vpt_vp_arg9.get())
    self.assertTrue(options.vpt_vp_arg10.is_accessible())
    self.assertEqual(options.vpt_vp_arg10.get(), 1.2)
Code Example #13
  def run_pipeline(self, pipeline):
    """Execute the entire pipeline and returns an DirectPipelineResult."""

    # Performing configured PTransform overrides.
    pipeline.replace_all(self._ptransform_overrides)

    # TODO: Move imports to top. The Pipeline <-> Runner dependency causes
    # problems with resolving imports when they are at the top.
    # pylint: disable=wrong-import-position
    from apache_beam.runners.direct.consumer_tracking_pipeline_visitor import \
      ConsumerTrackingPipelineVisitor
    from apache_beam.runners.direct.evaluation_context import EvaluationContext
    from apache_beam.runners.direct.executor import Executor
    from apache_beam.runners.direct.transform_evaluator import \
      TransformEvaluatorRegistry

    MetricsEnvironment.set_metrics_supported(True)
    logging.info('Running pipeline with DirectRunner.')
    self.consumer_tracking_visitor = ConsumerTrackingPipelineVisitor()
    pipeline.visit(self.consumer_tracking_visitor)

    clock = TestClock() if self._use_test_clock else RealClock()
    evaluation_context = EvaluationContext(
        pipeline._options,
        BundleFactory(stacked=pipeline._options.view_as(DirectOptions)
                      .direct_runner_use_stacked_bundle),
        self.consumer_tracking_visitor.root_transforms,
        self.consumer_tracking_visitor.value_to_consumers,
        self.consumer_tracking_visitor.step_names,
        self.consumer_tracking_visitor.views,
        clock)

    evaluation_context.use_pvalue_cache(self._cache)

    executor = Executor(self.consumer_tracking_visitor.value_to_consumers,
                        TransformEvaluatorRegistry(evaluation_context),
                        evaluation_context)
    # DirectRunner does not support injecting
    # PipelineOptions values at runtime
    RuntimeValueProvider.set_runtime_options({})
    # Start the executor. This is a non-blocking call; it starts execution in
    # background threads and returns.
    executor.start(self.consumer_tracking_visitor.root_transforms)
    result = DirectPipelineResult(executor, evaluation_context)

    if self._cache:
      # We are running in eager mode, block until the pipeline execution
      # completes in order to have full results in the cache.
      result.wait_until_finish()
      self._cache.finalize()

    return result
Code Example #14
File: fn_api_runner.py Project: lyft/beam
 def run_pipeline(self, pipeline, options):
   MetricsEnvironment.set_metrics_supported(False)
   RuntimeValueProvider.set_runtime_options({})
   # This is sometimes needed if type checking is disabled
   # to enforce that the inputs (and outputs) of GroupByKey operations
   # are known to be KVs.
   from apache_beam.runners.dataflow.dataflow_runner import DataflowRunner
   pipeline.visit(DataflowRunner.group_by_key_input_visitor())
   self._bundle_repeat = self._bundle_repeat or options.view_as(
       pipeline_options.DirectOptions).direct_runner_bundle_repeat
   self._profiler_factory = profiler.Profile.factory_from_options(
       options.view_as(pipeline_options.ProfilingOptions))
   return self.run_via_runner_api(pipeline.to_runner_api(
       default_environment=self._default_environment))
Code Example #15
  def test_get_destination_uri_fallback_temp_location(self):
    # Don't provide any runtime values.
    RuntimeValueProvider.set_runtime_options({})
    options = self.UserDefinedOptions()

    with self.assertLogs('apache_beam.io.gcp.bigquery_read_internal',
                         level='DEBUG') as context:
      bigquery_export_destination_uri(
          options.gcs_location, 'gs://bucket', uuid.uuid4().hex)
    self.assertEqual(
        context.output,
        [
            'DEBUG:apache_beam.io.gcp.bigquery_read_internal:gcs_location is '
            'empty, using temp_location instead'
        ])
Code Example #16
File: sideinputs_test.py Project: onderson/beam
  def test_bytes_read_are_reported(self):
    RuntimeValueProvider.set_runtime_options(
        {'experiments': ['sideinput_io_metrics_v2', 'other']})
    mock_read_counter = mock.MagicMock()
    source_records = ['a', 'b', 'c', 'd']
    sources = [
        FakeSource(source_records, notify_observers=True),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=3, read_counter=mock_read_counter)
    assert list(strip_windows(iterator_fn())) == source_records
    mock_read_counter.add_bytes_read.assert_called_with(4)

    # Remove runtime options from the runtime value provider.
    RuntimeValueProvider.set_runtime_options({})
Code Example #17
  def test_bytes_read_are_reported(self):
    RuntimeValueProvider.set_runtime_options(
        {'experiments': 'sideinput_io_metrics,other'})
    mock_read_counter = mock.MagicMock()
    source_records = ['a', 'b', 'c', 'd']
    sources = [
        FakeSource(source_records, notify_observers=True),
    ]
    iterator_fn = sideinputs.get_iterator_fn_for_sources(
        sources, max_reader_threads=3, read_counter=mock_read_counter)
    assert list(strip_windows(iterator_fn())) == source_records
    mock_read_counter.add_bytes_read.assert_called_with(4)

    # Remove runtime options from the runtime value provider.
    RuntimeValueProvider.set_runtime_options({})
Code Example #18
  def test_nested_value_provider_wrap_runtime(self):
    class UserDefinedOptions(PipelineOptions):
      @classmethod
      def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--vpt_vp_arg15',
            help='This keyword argument is a value provider')  # set at runtime

    options = UserDefinedOptions([])
    vp = NestedValueProvider(options.vpt_vp_arg15, lambda x: x + x)
    self.assertFalse(vp.is_accessible())

    RuntimeValueProvider.set_runtime_options({'vpt_vp_arg15': 'abc'})

    self.assertTrue(vp.is_accessible())
    self.assertEqual(vp.get(), 'abcabc')
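NestedValueProvider defers its translator until get() is called, so wrappers compose and remain inaccessible until runtime options arrive. A small sketch under the same import-path assumption as the earlier sketches:

from apache_beam.options.value_provider import (
    NestedValueProvider, RuntimeValueProvider)

inner = RuntimeValueProvider(
    option_name='path', value_type=str, default_value='data')
upper = NestedValueProvider(inner, str.upper)                # first translation
suffixed = NestedValueProvider(upper, lambda v: v + '.csv')  # second

assert not suffixed.is_accessible()   # still graph-construction time
RuntimeValueProvider.set_runtime_options({'path': 'events'})
assert suffixed.get() == 'EVENTS.CSV'
RuntimeValueProvider.set_runtime_options(None)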
Code Example #19
def test_runtime_serialized_file_list_is_deserialized_and_processed_by_insertion_order(
        cloudstorage):
    with TestPipeline() as p:
        bucket = f'{project_id}-cdc-imports'

        # Update sort_key based on the filename format
        def _sort_key(f):
            delimiter = '-'
            ts = f[f.rfind(delimiter) + 1:]
            return int(ts) if ts.isdigit() else f

        _sort_key = dill.dumps(_sort_key).hex()

        runtime_env = RuntimeValueProvider(option_name='env',
                                           value_type=str,
                                           default_value='local')
        runtime_bucket = RuntimeValueProvider(option_name='bucket',
                                              value_type=str,
                                              default_value=bucket)
        runtime_startswith = RuntimeValueProvider(
            option_name='files_startwith',
            value_type=str,
            default_value='vibe-tree-user-statuses-final')
        runtime_sort_key = RuntimeValueProvider(option_name='sort_key',
                                                value_type=str,
                                                default_value=_sort_key)
        for b in cloudstorage.client.list_blobs(bucket):
            b.delete()
        file_paths = [
            'vibe-tree-user-statuses-final-0083c-1987582612499',
            'vibe-tree-user-statuses-final-003c-1587582612405',
            'vibe-order-items-final-0030dd8697-1588231505823'
        ]
        expected_output = [
            'gs://icentris-ml-local-wbrito-cdc-imports/vibe-tree-user-statuses-final-003c-1587582612405',
            'gs://icentris-ml-local-wbrito-cdc-imports/vibe-tree-user-statuses-final-0083c-1987582612499'
        ]
        for f in file_paths:
            cloudstorage.client.upload_blob_from_string(bucket, f, f)

        p_paths = p | FileListIteratorTransform(
            env=runtime_env,
            bucket=runtime_bucket,
            files_startwith=runtime_startswith,
            sort_key=runtime_sort_key)
        assert_that(p_paths, equal_to(expected_output))
        RuntimeValueProvider.set_runtime_options(None)
Code Example #20
    def test_string_or_value_provider_only(self):
        str_file_pattern = tempfile.NamedTemporaryFile(delete=False).name
        self.assertEqual(str_file_pattern,
                         FileBasedSource(str_file_pattern)._pattern.value)

        static_vp_file_pattern = StaticValueProvider(value_type=str,
                                                     value=str_file_pattern)
        self.assertEqual(static_vp_file_pattern,
                         FileBasedSource(static_vp_file_pattern)._pattern)

        runtime_vp_file_pattern = RuntimeValueProvider(
            option_name='arg', value_type=str, default_value=str_file_pattern)
        self.assertEqual(runtime_vp_file_pattern,
                         FileBasedSource(runtime_vp_file_pattern)._pattern)
        # Reset runtime options to avoid side-effects in other tests.
        RuntimeValueProvider.set_runtime_options(None)

        invalid_file_pattern = 123
        with self.assertRaises(TypeError):
            FileBasedSource(invalid_file_pattern)
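The test above accepts a plain string, a StaticValueProvider, or a RuntimeValueProvider as the file pattern. A compact sketch contrasting the two ValueProvider flavors (same import-path assumption as above):

from apache_beam.options.value_provider import (
    RuntimeValueProvider, StaticValueProvider)

static_vp = StaticValueProvider(str, '/tmp/input*.txt')
assert static_vp.is_accessible()            # resolvable immediately
assert static_vp.get() == '/tmp/input*.txt'

runtime_vp = RuntimeValueProvider(
    option_name='pattern', value_type=str, default_value='/tmp/input*.txt')
assert not runtime_vp.is_accessible()       # needs a runtime context first
RuntimeValueProvider.set_runtime_options({})
assert runtime_vp.get() == '/tmp/input*.txt'
RuntimeValueProvider.set_runtime_options(None)  # avoid cross-test leakage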
Code Example #21
def ndjson(env, cloudstorage, record_testsuite_property):
    cloudstorage.client.delete_blob(bucket, dest_blob_name)
    assert cloudstorage.client.blob_exists(bucket, dest_blob_name) is False

    sql = BigQuery.querybuilder(union=('all', [
        BigQuery.querybuilder(select=[
            ('NULL', 'none'),
            ('True', 'true_bool'),
            ('False', 'false_bool'),
            ('"2020-04-03"', 'date'),
            ('"2020-04-03 13:45:00"', 'datetime'),
            ('"1966-06-06 06:06:06.666666 UTC"', 'timestamp'),
            ('"STRING"', 'string'),
            ('234', 'integer'),
            ('123.54', 'float'),
        ]),
        BigQuery.querybuilder(select=['NULL'] * 9),
        BigQuery.querybuilder(select=[
            '"String"', 'False', 'True', '"1993-09-03"',
            '"1993-09-03 03:44:00"', '"1993-09-03 03:44:00.777555 UTC"',
            '"Not String"', '567', '456'
        ])
    ]))

    RuntimeValueProvider.set_runtime_options(None)

    options = RuntimeOptions([
        '--env', env['env'], '--query',
        str(sql), '--destination', f'gs://{bucket}/{blob_name}'
    ])
    Runner._run(TestPipeline(options=options), options)

    assert cloudstorage.client.blob_exists(bucket, dest_blob_name) is True

    zbytes = cloudstorage.client.download_blob_as_string(
        bucket, dest_blob_name)
    raw_bytes = gzip.decompress(zbytes)
    lns = raw_bytes.decode('utf8').rstrip().split('\n')
    yield [json.loads(l) for l in lns]
Code Example #22
 def test_runtime_value_provider_to(self):
     RuntimeValueProvider.set_runtime_options(None)
     # Constructor order is (option_name, value_type, default_value).
     rvp = RuntimeValueProvider('arg', int, 123)
     self.assertEqual(JsonValue(is_null=True), to_json_value(rvp))
     # Reset runtime options to avoid side-effects in other tests.
     RuntimeValueProvider.set_runtime_options(None)
Code Example #23
 def tearDown(self):
     # Reset runtime options to avoid side-effects in other tests.
     RuntimeValueProvider.set_runtime_options(None)
Code Example #24
 def setUp(self):
     # Reset runtime options to avoid side-effects caused by other tests.
     # Note that is_accessible assertions require runtime_options to
     # be uninitialized.
     RuntimeValueProvider.set_runtime_options(None)
Code Example #25
def main(unused_argv):
    """Main entry point for SDK Fn Harness."""
    if 'LOGGING_API_SERVICE_DESCRIPTOR' in os.environ:
        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
            text_format.Merge(os.environ['LOGGING_API_SERVICE_DESCRIPTOR'],
                              logging_service_descriptor)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            # TODO(BEAM-5468): This should be picked up from pipeline options.
            logging.getLogger().setLevel(logging.DEBUG)
            logging.getLogger().addHandler(fn_log_handler)
            logging.info('Logging handler created.')
        except Exception:
            logging.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None
    else:
        fn_log_handler = None

    # Start status HTTP server thread.
    thread = threading.Thread(name='status_http_server',
                              target=StatusServer().start)
    thread.daemon = True
    thread.setName('status-server-daemon')
    thread.start()

    if 'PIPELINE_OPTIONS' in os.environ:
        sdk_pipeline_options = _parse_pipeline_options(
            os.environ['PIPELINE_OPTIONS'])
    else:
        sdk_pipeline_options = PipelineOptions.from_dictionary({})

    if 'SEMI_PERSISTENT_DIRECTORY' in os.environ:
        semi_persistent_directory = os.environ['SEMI_PERSISTENT_DIRECTORY']
    else:
        semi_persistent_directory = None

    logging.info('semi_persistent_directory: %s', semi_persistent_directory)
    _worker_id = os.environ.get('WORKER_ID', None)

    try:
        _load_main_session(semi_persistent_directory)
    except Exception:  # pylint: disable=broad-except
        exception_details = traceback.format_exc()
        logging.error('Could not load main session: %s',
                      exception_details,
                      exc_info=True)

    try:
        logging.info('Python sdk harness started with pipeline_options: %s',
                     sdk_pipeline_options.get_all_options(drop_default=True))
        RuntimeValueProvider.set_runtime_options(
            sdk_pipeline_options.view_as(
                pipeline_options.HadoopFileSystemOptions).get_all_options())
        service_descriptor = endpoints_pb2.ApiServiceDescriptor()
        text_format.Merge(os.environ['CONTROL_API_SERVICE_DESCRIPTOR'],
                          service_descriptor)
        # TODO(robertwb): Support credentials.
        assert not service_descriptor.oauth2_client_credentials_grant.url
        SdkHarness(control_address=service_descriptor.url,
                   worker_count=_get_worker_count(sdk_pipeline_options),
                   worker_id=_worker_id,
                   profiler_factory=profiler.Profile.factory_from_options(
                       sdk_pipeline_options.view_as(
                           pipeline_options.ProfilingOptions))).run()
        logging.info('Python sdk harness exiting.')
    except:  # pylint: disable=broad-except
        logging.exception('Python sdk harness failed: ')
        raise
    finally:
        if fn_log_handler:
            fn_log_handler.close()
Code Example #26
File: value_provider_test.py Project: xubii/beam
 def setUp(self):
     # Reset runtime options, since the is_accessible assertions require them to
     # be uninitialized.
     RuntimeValueProvider.set_runtime_options(None)
Code Example #27
def create_harness(environment, dry_run=False):
    """Creates SDK Fn Harness."""
    if 'LOGGING_API_SERVICE_DESCRIPTOR' in environment:
        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
            text_format.Merge(environment['LOGGING_API_SERVICE_DESCRIPTOR'],
                              logging_service_descriptor)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            # TODO(BEAM-5468): This should be picked up from pipeline options.
            logging.getLogger().setLevel(logging.INFO)
            logging.getLogger().addHandler(fn_log_handler)
            _LOGGER.info('Logging handler created.')
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None
    else:
        fn_log_handler = None

    pipeline_options_dict = _load_pipeline_options(
        environment.get('PIPELINE_OPTIONS'))
    # These are used for dataflow templates.
    RuntimeValueProvider.set_runtime_options(pipeline_options_dict)
    sdk_pipeline_options = PipelineOptions.from_dictionary(
        pipeline_options_dict)
    filesystems.FileSystems.set_options(sdk_pipeline_options)

    if 'SEMI_PERSISTENT_DIRECTORY' in environment:
        semi_persistent_directory = environment['SEMI_PERSISTENT_DIRECTORY']
    else:
        semi_persistent_directory = None

    _LOGGER.info('semi_persistent_directory: %s', semi_persistent_directory)
    _worker_id = environment.get('WORKER_ID', None)

    try:
        _load_main_session(semi_persistent_directory)
    except CorruptMainSessionException:
        exception_details = traceback.format_exc()
        _LOGGER.error('Could not load main session: %s',
                      exception_details,
                      exc_info=True)
        raise
    except Exception:  # pylint: disable=broad-except
        exception_details = traceback.format_exc()
        _LOGGER.error('Could not load main session: %s',
                      exception_details,
                      exc_info=True)

    _LOGGER.info('Pipeline_options: %s',
                 sdk_pipeline_options.get_all_options(drop_default=True))
    control_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(environment['CONTROL_API_SERVICE_DESCRIPTOR'],
                      control_service_descriptor)
    if 'STATUS_API_SERVICE_DESCRIPTOR' in environment:
        text_format.Merge(environment['STATUS_API_SERVICE_DESCRIPTOR'],
                          status_service_descriptor)
    # TODO(robertwb): Support authentication.
    assert not control_service_descriptor.HasField('authentication')

    experiments = sdk_pipeline_options.view_as(DebugOptions).experiments or []
    enable_heap_dump = 'enable_heap_dump' in experiments
    if dry_run:
        return
    sdk_harness = SdkHarness(
        control_address=control_service_descriptor.url,
        status_address=status_service_descriptor.url,
        worker_id=_worker_id,
        state_cache_size=_get_state_cache_size(experiments),
        data_buffer_time_limit_ms=_get_data_buffer_time_limit_ms(experiments),
        profiler_factory=profiler.Profile.factory_from_options(
            sdk_pipeline_options.view_as(ProfilingOptions)),
        enable_heap_dump=enable_heap_dump)
    return fn_log_handler, sdk_harness
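The set_runtime_options(pipeline_options_dict) call above is what makes Dataflow templates work: options declared as value providers at template-construction time only become readable once the harness installs the job's options. A hedged end-to-end sketch (TemplateOptions and --input are illustrative names, not taken from the code above):

from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.value_provider import RuntimeValueProvider

class TemplateOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument('--input', type=str)

options = TemplateOptions([])           # nothing supplied at template time
assert not options.input.is_accessible()

# What create_harness() effectively does with the job's PIPELINE_OPTIONS:
RuntimeValueProvider.set_runtime_options({'input': 'gs://bucket/file.txt'})
assert options.input.get() == 'gs://bucket/file.txt'
RuntimeValueProvider.set_runtime_options(None)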