Example #1
def test_unsupported_type_display_data(self):
    class MyDisplayComponent(HasDisplayData):
      def display_data(self):
        return {'item_key': 'item_value'}

    with self.assertRaises(ValueError):
      DisplayData.create_from_options(MyDisplayComponent())
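A note on the two entry points used throughout these examples: DisplayData.create_from_options accepts only PipelineOptions instances, which is why the plain HasDisplayData component above raises ValueError, while DisplayData.create_from handles any HasDisplayData component. The sketch below is not taken from the examples on this page; it assumes only a standard apache_beam installation and shows the corresponding happy path.

# Minimal sketch (assumption: apache_beam is importable; not from the examples here).
from apache_beam.transforms.display import DisplayData, HasDisplayData

class MyDisplayComponent(HasDisplayData):
  def display_data(self):
    return {'item_key': 'item_value'}

# create_from accepts any HasDisplayData; create_from_options would still raise ValueError.
dd = DisplayData.create_from(MyDisplayComponent())
print([(item.key, item.value) for item in dd.items])  # [('item_key', 'item_value')]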
Example #2
 def test_project_table_display_data(self):
   sinkq = beam.io.BigQuerySink('PROJECT:dataset.table')
   dd = DisplayData.create_from(sinkq)
   expected_items = [
       DisplayDataItemMatcher('table', 'PROJECT:dataset.table'),
       DisplayDataItemMatcher('validation', False)]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #3
 def individual_test_per_key_dd(sampleFn, n):
   trs = [sampleFn(n)]
   for transform in trs:
     dd = DisplayData.create_from(transform)
     hc.assert_that(
         dd.items,
         hc.contains_inanyorder(DisplayDataItemMatcher('n', transform._n)))
Example #4
 def test_create_list_display_data(self):
   flags = ['--extra_package', 'package1', '--extra_package', 'package2']
   pipeline_options = PipelineOptions(flags=flags)
   items = DisplayData.create_from_options(pipeline_options).items
   hc.assert_that(items, hc.contains_inanyorder(
       DisplayDataItemMatcher('extra_packages',
                              str(['package1', 'package2']))))
Example #5
 def test_sink_display_data(self):
   file_name = 'some_avro_sink'
   sink = _create_avro_sink(
       file_name,
       self.SCHEMA,
       'null',
       '.end',
       0,
       None,
       'application/x-avro',
       use_fastavro=self.use_fastavro)
   dd = DisplayData.create_from(sink)
   expected_items = [
       DisplayDataItemMatcher(
           'schema',
           str(self.SCHEMA)),
       DisplayDataItemMatcher(
           'file_pattern',
           'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
       DisplayDataItemMatcher(
           'codec',
           'null'),
       DisplayDataItemMatcher(
           'compression',
           'uncompressed')]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #6
  def test_display_data(self):
    sink = PubSubSink('a_topic')
    dd = DisplayData.create_from(sink)
    expected_items = [
        DisplayDataItemMatcher('topic', 'a_topic')]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #7
  def test_display_data_no_subscription(self):
    source = PubSubSource('a_topic')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('topic', 'a_topic')]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #8
 def test_value_provider_display_data(self):
   class TestOptions(PipelineOptions):
     @classmethod
     def _add_argparse_args(cls, parser):
       parser.add_value_provider_argument(
           '--int_flag',
           type=int,
           help='int_flag description')
       parser.add_value_provider_argument(
           '--str_flag',
           type=str,
           default='hello',
           help='str_flag description')
       parser.add_value_provider_argument(
           '--float_flag',
           type=float,
           help='float_flag description')
   options = TestOptions(['--int_flag', '1'])
   items = DisplayData.create_from_options(options).items
   expected_items = [
       DisplayDataItemMatcher(
           'int_flag',
           '1'),
       DisplayDataItemMatcher(
           'str_flag',
           'RuntimeValueProvider(option: str_flag,'
           ' type: str, default_value: \'hello\')'
       ),
       DisplayDataItemMatcher(
           'float_flag',
           'RuntimeValueProvider(option: float_flag,'
           ' type: float, default_value: None)'
       )
   ]
   hc.assert_that(items, hc.contains_inanyorder(*expected_items))
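The expected strings in the test above follow from how value-provider options are materialized: a flag supplied on the command line (int_flag) becomes a StaticValueProvider and is displayed as its value, while flags left unset (str_flag, float_flag) remain RuntimeValueProvider instances and are displayed via their repr. A small sketch, assuming the TestOptions class defined above and a recent apache_beam where value providers live in apache_beam.options.value_provider:

# Sketch only; TestOptions is the class from the test above, and the module path
# is per recent SDK versions.
from apache_beam.options.value_provider import RuntimeValueProvider, StaticValueProvider

options = TestOptions(['--int_flag', '1'])
print(isinstance(options.int_flag, StaticValueProvider))   # True: given on the command line
print(isinstance(options.str_flag, RuntimeValueProvider))  # True: deferred to runtime
print(str(options.str_flag))  # the RuntimeValueProvider repr asserted above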
Example #9
  def _add_step(self, step_kind, step_label, transform_node, side_tags=()):
    """Creates a Step object and adds it to the cache."""
    # Import here to avoid adding the dependency for local running scenarios.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam.runners.dataflow.internal import apiclient
    step = apiclient.Step(step_kind, self._get_unique_step_name())
    self.job.proto.steps.append(step.proto)
    step.add_property(PropertyNames.USER_NAME, step_label)
    # Cache the node/step association for the main output of the transform node.
    self._cache.cache_output(transform_node, None, step)
    # If side_tags is not () then this is a multi-output transform node and we
    # need to cache the (node, tag, step) for each of the tags used to access
    # the outputs. This is essential because the keys used to search in the
    # cache always contain the tag.
    for tag in side_tags:
      self._cache.cache_output(transform_node, tag, step)

    # Finally, we add the display data items to the pipeline step.
    # If the transform contains no display data then an empty list is added.
    step.add_property(
        PropertyNames.DISPLAY_DATA,
        [item.get_dict() for item in
         DisplayData.create_from(transform_node.transform).items])

    return step
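For context, the DISPLAY_DATA step property built above is simply a list of dictionaries, one per DisplayDataItem, produced by get_dict(). A rough sketch of generating that list outside the runner (MyDoFn here is an illustrative stand-in, not part of the code above):

# Sketch (assumption: apache_beam importable; MyDoFn is purely illustrative).
import apache_beam as beam
from apache_beam.transforms.display import DisplayData

class MyDoFn(beam.DoFn):
  def process(self, element):
    yield element

  def display_data(self):
    return {'dofn_value': 42}

# The same serialization _add_step performs for a transform node's transform.
items = [item.get_dict() for item in DisplayData.create_from(beam.ParDo(MyDoFn())).items]
print(items)  # each entry carries the item's key, namespace, type and value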
Example #10
  def test_display_data(self):
    sink = _PubSubPayloadSink('projects/fakeprj/topics/a_topic')
    dd = DisplayData.create_from(sink)
    expected_items = [
        DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic')]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #11
 def test_query_only_display_data(self):
   source = beam.io.BigQuerySource(query='my_query')
   dd = DisplayData.create_from(source)
   expected_items = [
       DisplayDataItemMatcher('validation', False),
       DisplayDataItemMatcher('query', 'my_query')]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #12
 def test_date_partitioned_table_name(self):
   source = beam.io.BigQuerySource('dataset.table$20030102', validate=True)
   dd = DisplayData.create_from(source)
   expected_items = [
       DisplayDataItemMatcher('validation', True),
       DisplayDataItemMatcher('table', 'dataset.table$20030102')]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #13
 def individual_test_per_key_dd(combineFn):
   transform = beam.CombinePerKey(combineFn)
   dd = DisplayData.create_from(transform)
   expected_items = [
       DisplayDataItemMatcher('combine_fn', combineFn.__class__),
       DisplayDataItemMatcher('n', combineFn._n),
       DisplayDataItemMatcher('compare', combineFn._compare.__name__)]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #14
 def test_combine_globally_display_data(self):
   transform = beam.CombineGlobally(combine.Smallest(5))
   dd = DisplayData.create_from(transform)
   expected_items = [
       DisplayDataItemMatcher('combine_fn', combine.Smallest),
       DisplayDataItemMatcher('n', 5),
       DisplayDataItemMatcher('compare', 'gt')]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #15
  def test_display_data_no_subscription(self):
    source = _PubSubSource('projects/fakeprj/topics/a_topic')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
        DisplayDataItemMatcher('with_attributes', False),
    ]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #16
 def test_basic_combiners_display_data(self):
   transform = beam.CombineGlobally(
       combine.TupleCombineFn(max, combine.MeanCombineFn(), sum))
   dd = DisplayData.create_from(transform)
   expected_items = [
       DisplayDataItemMatcher('combine_fn', combine.TupleCombineFn),
       DisplayDataItemMatcher('combiners',
                              "['max', 'MeanCombineFn', 'sum']")]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #17
 def test_single_file_display_data(self):
   file_name, _ = write_data(10)
   fbs = LineSource(file_name)
   dd = DisplayData.create_from(fbs)
   expected_items = [
       DisplayDataItemMatcher('file_pattern', file_name),
       DisplayDataItemMatcher('compression', 'auto')]
   hc.assert_that(dd.items,
                  hc.contains_inanyorder(*expected_items))
Example #18
 def test_source_creation_display_data(self):
   file_name = 'dummy_pattern'
   fbs = LineSource(file_name, validate=False)
   dd = DisplayData.create_from(fbs)
   expected_items = [
       DisplayDataItemMatcher('compression', 'auto'),
       DisplayDataItemMatcher('file_pattern', file_name)]
   hc.assert_that(dd.items,
                  hc.contains_inanyorder(*expected_items))
Example #19
  def test_display_data(self):
    source = _PubSubPayloadSource('a_topic', 'a_subscription', 'a_label')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('topic', 'a_topic'),
        DisplayDataItemMatcher('subscription', 'a_subscription'),
        DisplayDataItemMatcher('id_label', 'a_label')]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #20
    def test_display_data(self):
        source = _PubSubPayloadSource('a_topic', 'a_subscription', 'a_label')
        dd = DisplayData.create_from(source)
        expected_items = [
            DisplayDataItemMatcher('topic', 'a_topic'),
            DisplayDataItemMatcher('subscription', 'a_subscription'),
            DisplayDataItemMatcher('id_label', 'a_label')
        ]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #21
  def test_unicode_type_display_data(self):
    class MyDoFn(beam.DoFn):
      def display_data(self):
        return {'unicode_string': unicode('my string'),
                'unicode_literal_string': u'my literal string'}

    fn = MyDoFn()
    dd = DisplayData.create_from(fn)
    for item in dd.items:
      self.assertEqual(item.type, 'STRING')
Example #22
 def test_basic_combiners_display_data(self):
     transform = beam.CombineGlobally(
         combine.TupleCombineFn(max, combine.MeanCombineFn(), sum))
     dd = DisplayData.create_from(transform)
     expected_items = [
         DisplayDataItemMatcher('combine_fn', combine.TupleCombineFn),
         DisplayDataItemMatcher('combiners',
                                "['max', 'MeanCombineFn', 'sum']")
     ]
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #23
  def test_display_data_item_on_validate_true(self):
    source = beam.io.BigQuerySource(
        'dataset.table', validate=True, use_dataflow_native_source=True)

    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('validation', True),
        DisplayDataItemMatcher('table', 'dataset.table')
    ]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #24
  def test_read_display_data(self):
    file_name = 'some_avro_source'
    read = avroio.ReadFromAvro(file_name, validate=False)
    dd = DisplayData.create_from(read)

    # No extra avro parameters for AvroSource.
    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #25
    def test_display_data_topic(self):
        source = _PubSubSource('projects/fakeprj/topics/a_topic', None,
                               'a_label')
        dd = DisplayData.create_from(source)
        expected_items = [
            DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
            DisplayDataItemMatcher('id_label', 'a_label')
        ]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #26
    def test_read_display_data(self):
        file_name = 'some_avro_source'
        read = avroio.ReadFromAvro(file_name, validate=False)
        dd = DisplayData.create_from(read)

        # No extra avro parameters for AvroSource.
        expected_items = [
            DisplayDataItemMatcher('compression', 'auto'),
            DisplayDataItemMatcher('file_pattern', file_name)
        ]
        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #27
  def test_display_data_subscription(self):
    source = _PubSubSource(
        None, 'projects/fakeprj/subscriptions/a_subscription', 'a_label')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher(
            'subscription', 'projects/fakeprj/subscriptions/a_subscription'),
        DisplayDataItemMatcher('id_label', 'a_label'),
        DisplayDataItemMatcher('with_attributes', False),
    ]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #28
    def test_unicode_type_display_data(self):
        class MyDoFn(beam.DoFn):
            def display_data(self):
                return {
                    'unicode_string': unicode('my string'),
                    'unicode_literal_string': u'my literal string'
                }

        fn = MyDoFn()
        dd = DisplayData.create_from(fn)
        for item in dd.items:
            self.assertEqual(item.type, 'STRING')
Example #29
  def test_display_data_topic(self):
    source = _PubSubPayloadSource(
        'projects/fakeprj/topics/a_topic',
        None,
        'a_label')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher(
            'topic', 'projects/fakeprj/topics/a_topic'),
        DisplayDataItemMatcher('id_label', 'a_label')]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #30
  def test_display_data_subscription(self):
    source = _PubSubSource(
        None,
        'projects/fakeprj/subscriptions/a_subscription',
        'a_label')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher(
            'subscription', 'projects/fakeprj/subscriptions/a_subscription'),
        DisplayDataItemMatcher('id_label', 'a_label')]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #31
  def test_read_display_data(self):
    file_name = 'some_parquet_source'
    read = ReadFromParquet(file_name, validate=False)
    dd = DisplayData.create_from(read)

    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #32
 def individual_test_per_key_dd(sampleFn, args, kwargs):
   trs = [sampleFn(*args, **kwargs)]
   for transform in trs:
     dd = DisplayData.create_from(transform)
     expected_items = [
         DisplayDataItemMatcher('fn', transform._fn.__name__)]
     if args:
       expected_items.append(
           DisplayDataItemMatcher('args', str(args)))
     if kwargs:
       expected_items.append(
           DisplayDataItemMatcher('kwargs', str(kwargs)))
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #33
  def test_display_data(self):
    sink = _PubSubSink('projects/fakeprj/topics/a_topic',
                       id_label='id', with_attributes=False,
                       timestamp_attribute='time')
    dd = DisplayData.create_from(sink)
    expected_items = [
        DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
        DisplayDataItemMatcher('id_label', 'id'),
        DisplayDataItemMatcher('with_attributes', False),
        DisplayDataItemMatcher('timestamp_attribute', 'time'),
    ]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #34
 def test_write_display_data(self):
     file_name = 'some_avro_sink'
     write = avroio.WriteToAvro(file_name, self.SCHEMA)
     dd = DisplayData.create_from(write)
     expected_items = [
         DisplayDataItemMatcher('schema', str(self.SCHEMA)),
         DisplayDataItemMatcher(
             'file_pattern',
             'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d'),
         DisplayDataItemMatcher('codec', 'deflate'),
         DisplayDataItemMatcher('compression', 'uncompressed')
     ]
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #35
    def test_display_data(self):
        sink = WriteToPubSub('projects/fakeprj/topics/a_topic',
                             id_label='id',
                             timestamp_attribute='time')
        dd = DisplayData.create_from(sink)
        expected_items = [
            DisplayDataItemMatcher('topic', 'projects/fakeprj/topics/a_topic'),
            DisplayDataItemMatcher('id_label', 'id'),
            DisplayDataItemMatcher('with_attributes', True),
            DisplayDataItemMatcher('timestamp_attribute', 'time'),
        ]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #37
  def test_drop_if_none(self):
    class MyDoFn(beam.DoFn):
      def display_data(self):
        return {'some_val': DisplayDataItem('something').drop_if_none(),
                'non_val': DisplayDataItem(None).drop_if_none(),
                'def_val': DisplayDataItem(True).drop_if_default(True),
                'nodef_val': DisplayDataItem(True).drop_if_default(False)}

    dd = DisplayData.create_from(MyDoFn())
    expected_items = [DisplayDataItemMatcher('some_val',
                                             'something'),
                      DisplayDataItemMatcher('nodef_val',
                                             True)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
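The two helpers exercised above control whether an item is emitted at all: drop_if_none() discards the item when its value is None, and drop_if_default(default) discards it when the value equals the supplied default, which is why only some_val and nodef_val survive. A minimal standalone sketch, assuming a standard apache_beam installation:

# Sketch; mirrors the semantics asserted in the test above.
from apache_beam.transforms.display import DisplayData, DisplayDataItem, HasDisplayData

class Component(HasDisplayData):
  def display_data(self):
    return {'kept': DisplayDataItem('something').drop_if_none(),
            'dropped_none': DisplayDataItem(None).drop_if_none(),
            'dropped_default': DisplayDataItem(5).drop_if_default(5)}

print(sorted(item.key for item in DisplayData.create_from(Component()).items))  # ['kept']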
Example #38
 def test_sink_display_data(self):
     file_name = 'some_avro_sink'
     sink = AvroSink(file_name, self.SCHEMA, 'null', '.end', 0, None,
                     'application/x-avro')
     dd = DisplayData.create_from(sink)
     expected_items = [
         DisplayDataItemMatcher('schema', str(self.SCHEMA)),
         DisplayDataItemMatcher(
             'file_pattern',
             'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
         DisplayDataItemMatcher('codec', 'null'),
         DisplayDataItemMatcher('compression', 'uncompressed')
     ]
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #39
 def test_write_display_data(self):
   file_name = 'some_parquet_sink'
   write = WriteToParquet(file_name, self.SCHEMA)
   dd = DisplayData.create_from(write)
   expected_items = [
       DisplayDataItemMatcher('codec', 'none'),
       DisplayDataItemMatcher('schema', str(self.SCHEMA)),
       DisplayDataItemMatcher('row_group_buffer_size', str(64 * 1024 * 1024)),
       DisplayDataItemMatcher(
           'file_pattern',
           'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d'),
       DisplayDataItemMatcher('compression', 'uncompressed')
   ]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #40
 def test_file_sink_display_data(self):
   temp_path = os.path.join(self._new_tempdir(), 'display')
   sink = MyFileBasedSink(
       temp_path, file_name_suffix='.output', coder=coders.ToStringCoder())
   dd = DisplayData.create_from(sink)
   expected_items = [
       DisplayDataItemMatcher(
           'compression', 'auto'),
       DisplayDataItemMatcher(
           'file_pattern',
           '{}{}'.format(
               temp_path,
               '-%(shard_num)05d-of-%(num_shards)05d.output'))]
   hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #41
 def test_file_sink_display_data(self):
     temp_path = os.path.join(self._new_tempdir(), 'display')
     sink = MyFileBasedSink(temp_path,
                            file_name_suffix='.output',
                            coder=coders.ToStringCoder())
     dd = DisplayData.create_from(sink)
     expected_items = [
         DisplayDataItemMatcher('compression', 'auto'),
         DisplayDataItemMatcher(
             'file_pattern',
             '{}{}'.format(temp_path,
                           '-%(shard_num)05d-of-%(num_shards)05d.output'))
     ]
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #42
  def test_table_reference_display_data(self):
    source = beam.io.BigQuerySource('dataset.table')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('validation', False),
        DisplayDataItemMatcher('table', 'dataset.table')]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

    source = beam.io.BigQuerySource('project:dataset.table')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('validation', False),
        DisplayDataItemMatcher('table', 'project:dataset.table')]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

    source = beam.io.BigQuerySource('xyz.com:project:dataset.table')
    dd = DisplayData.create_from(source)
    expected_items = [
        DisplayDataItemMatcher('validation', False),
        DisplayDataItemMatcher('table', 'xyz.com:project:dataset.table')]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #44
    def test_base_cases(self):
        """Tests basic display data cases (key:value, key:dict).

        It does not test subcomponent inclusion.
        """
        class MyDoFn(beam.DoFn):
            def __init__(self, my_display_data=None):
                self.my_display_data = my_display_data

            def process(self, context):
                yield context.element + 1

            def display_data(self):
                return {
                    'static_integer': 120,
                    'static_string': 'static me!',
                    'complex_url': DisplayDataItem(
                        'github.com', url='http://github.com', label='The URL'),
                    'python_class': HasDisplayData,
                    'my_dd': self.my_display_data
                }

        now = datetime.now()
        fn = MyDoFn(my_display_data=now)
        dd = DisplayData.create_from(fn)
        nspace = '{}.{}'.format(fn.__module__, fn.__class__.__name__)
        expected_items = [
            DisplayDataItemMatcher(key='complex_url',
                                   value='github.com',
                                   namespace=nspace,
                                   label='The URL'),
            DisplayDataItemMatcher(key='my_dd', value=now, namespace=nspace),
            DisplayDataItemMatcher(key='python_class',
                                   value=HasDisplayData,
                                   namespace=nspace,
                                   shortValue='HasDisplayData'),
            DisplayDataItemMatcher(key='static_integer',
                                   value=120,
                                   namespace=nspace),
            DisplayDataItemMatcher(key='static_string',
                                   value='static me!',
                                   namespace=nspace)
        ]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #45
  def test_subcomponent(self):
    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

    dofn = SpecialDoFn()
    pardo = beam.ParDo(dofn)
    dd = DisplayData.create_from(pardo)
    dofn_nspace = '{}.{}'.format(dofn.__module__, dofn.__class__.__name__)
    pardo_nspace = '{}.{}'.format(pardo.__module__, pardo.__class__.__name__)
    expected_items = [
        DisplayDataItemMatcher('dofn_value', 42, dofn_nspace),
        DisplayDataItemMatcher('fn', SpecialDoFn, pardo_nspace)]

    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #46
    def test_file_sink_display_data(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          coder=coders.ToStringCoder())
        dd = DisplayData.create_from(sink)
        expected_items = [
            DisplayDataItemMatcher('compression', 'auto'),
            DisplayDataItemMatcher(
                'file_pattern',
                '{}{}'.format(temp_path,
                              '-%(shard_num)05d-of-%(num_shards)05d.foo'))
        ]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #47
  def test_source_display_data(self):
    file_name = 'some_avro_source'
    source = _create_avro_source(
        file_name, validate=False, use_fastavro=self.use_fastavro)
    dd = DisplayData.create_from(source)

    # No extra avro parameters for AvroSource.
    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #49
 def test_sink_display_data(self):
     file_name = 'some_parquet_sink'
     sink = _create_parquet_sink(file_name, self.SCHEMA, 'none',
                                 1024 * 1024, 1000, False, '.end', 0, None,
                                 'application/x-parquet')
     dd = DisplayData.create_from(sink)
     expected_items = [
         DisplayDataItemMatcher('schema', str(self.SCHEMA)),
         DisplayDataItemMatcher(
             'file_pattern',
             'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
         DisplayDataItemMatcher('codec', 'none'),
         DisplayDataItemMatcher('row_group_buffer_size', str(1024 * 1024)),
         DisplayDataItemMatcher('compression', 'uncompressed')
     ]
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #50
 def individual_test_per_key_dd(sampleFn, args, kwargs):
   trs = [beam.CombinePerKey(sampleFn(*args, **kwargs)),
          beam.CombineGlobally(sampleFn(*args, **kwargs))]
   for transform in trs:
     dd = DisplayData.create_from(transform)
     expected_items = [
         DisplayDataItemMatcher('fn', sampleFn.fn.__name__),
         DisplayDataItemMatcher('combine_fn',
                                transform.fn.__class__)]
     if len(args) > 0:
       expected_items.append(
           DisplayDataItemMatcher('args', str(args)))
     if len(kwargs) > 0:
       expected_items.append(
           DisplayDataItemMatcher('kwargs', str(kwargs)))
     hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #51
    def __init__(self, packages, options, environment_version, pipeline_url):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.pipeline_url = pipeline_url
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(self._get_python_sdk_name())),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(beam_version.__version__))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        _verify_interpreter_version_is_supported(options)
        if self.standard_options.streaming:
            job_type = 'FNAPI_STREAMING'
        else:
            if _use_fnapi(options):
                job_type = 'FNAPI_BATCH'
            else:
                job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # TODO: Use enumerated type instead of strings for job types.
        if job_type.startswith('FNAPI_'):
            runner_harness_override = (get_runner_harness_container_image())
            self.debug_options.experiments = self.debug_options.experiments or []
            if runner_harness_override:
                self.debug_options.experiments.append(
                    'runner_harness_container_image=' +
                    runner_harness_override)
            # Add the use_multiple_sdk_containers flag if it is not already present.
            # Do not add the flag if 'no_use_multiple_sdk_containers' is present.
            # TODO: Clean up use_multiple_sdk_containers once Python SDK versions
            # through 2.4 are deprecated.
            debug_options_experiments = self.debug_options.experiments
            if ('use_multiple_sdk_containers' not in debug_options_experiments
                    and 'no_use_multiple_sdk_containers'
                    not in debug_options_experiments):
                self.debug_options.experiments.append(
                    'use_multiple_sdk_containers')
        # FlexRS
        if self.google_cloud_options.flexrs_goal == 'COST_OPTIMIZED':
            self.proto.flexResourceSchedulingGoal = (
                dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
                FLEXRS_COST_OPTIMIZED)
        elif self.google_cloud_options.flexrs_goal == 'SPEED_OPTIMIZED':
            self.proto.flexResourceSchedulingGoal = (
                dataflow.Environment.FlexResourceSchedulingGoalValueValuesEnum.
                FLEXRS_SPEED_OPTIMIZED)
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))

        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.subnetwork:
            pool.subnetwork = self.worker_options.subnetwork
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            pool.workerHarnessContainerImage = (
                get_default_container_image_for_current_sdk(job_type))
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.items() if v is not None
            }
            options_dict["pipelineUrl"] = pipeline_url
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
Example #52
    def __init__(self, packages, options, environment_version):
        self.standard_options = options.view_as(StandardOptions)
        self.google_cloud_options = options.view_as(GoogleCloudOptions)
        self.worker_options = options.view_as(WorkerOptions)
        self.debug_options = options.view_as(DebugOptions)
        self.proto = dataflow.Environment()
        self.proto.clusterManagerApiService = GoogleCloudOptions.COMPUTE_API_SERVICE
        self.proto.dataset = '{}/cloud_dataflow'.format(
            GoogleCloudOptions.BIGQUERY_API_SERVICE)
        self.proto.tempStoragePrefix = (
            self.google_cloud_options.temp_location.replace(
                'gs:/', GoogleCloudOptions.STORAGE_API_SERVICE))
        # User agent information.
        self.proto.userAgent = dataflow.Environment.UserAgentValue()
        self.local = 'localhost' in self.google_cloud_options.dataflow_endpoint

        if self.google_cloud_options.service_account_email:
            self.proto.serviceAccountEmail = (
                self.google_cloud_options.service_account_email)

        sdk_name, version_string = get_sdk_name_and_version()

        self.proto.userAgent.additionalProperties.extend([
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='name', value=to_json_value(sdk_name)),
            dataflow.Environment.UserAgentValue.AdditionalProperty(
                key='version', value=to_json_value(version_string))
        ])
        # Version information.
        self.proto.version = dataflow.Environment.VersionValue()
        if self.standard_options.streaming:
            job_type = 'PYTHON_STREAMING'
        else:
            job_type = 'PYTHON_BATCH'
        self.proto.version.additionalProperties.extend([
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='job_type', value=to_json_value(job_type)),
            dataflow.Environment.VersionValue.AdditionalProperty(
                key='major', value=to_json_value(environment_version))
        ])
        # Experiments
        if self.debug_options.experiments:
            for experiment in self.debug_options.experiments:
                self.proto.experiments.append(experiment)
        # Worker pool(s) information.
        package_descriptors = []
        for package in packages:
            package_descriptors.append(
                dataflow.Package(
                    location='%s/%s' %
                    (self.google_cloud_options.staging_location.replace(
                        'gs:/',
                        GoogleCloudOptions.STORAGE_API_SERVICE), package),
                    name=package))

        pool = dataflow.WorkerPool(
            kind='local' if self.local else 'harness',
            packages=package_descriptors,
            taskrunnerSettings=dataflow.TaskRunnerSettings(
                parallelWorkerSettings=dataflow.WorkerSettings(
                    baseUrl=GoogleCloudOptions.DATAFLOW_ENDPOINT,
                    servicePath=self.google_cloud_options.dataflow_endpoint)))
        pool.autoscalingSettings = dataflow.AutoscalingSettings()
        # Set worker pool options received through command line.
        if self.worker_options.num_workers:
            pool.numWorkers = self.worker_options.num_workers
        if self.worker_options.max_num_workers:
            pool.autoscalingSettings.maxNumWorkers = (
                self.worker_options.max_num_workers)
        if self.worker_options.autoscaling_algorithm:
            values_enum = dataflow.AutoscalingSettings.AlgorithmValueValuesEnum
            pool.autoscalingSettings.algorithm = {
                'NONE': values_enum.AUTOSCALING_ALGORITHM_NONE,
                'THROUGHPUT_BASED': values_enum.AUTOSCALING_ALGORITHM_BASIC,
            }.get(self.worker_options.autoscaling_algorithm)
        if self.worker_options.machine_type:
            pool.machineType = self.worker_options.machine_type
        if self.worker_options.disk_size_gb:
            pool.diskSizeGb = self.worker_options.disk_size_gb
        if self.worker_options.disk_type:
            pool.diskType = self.worker_options.disk_type
        if self.worker_options.zone:
            pool.zone = self.worker_options.zone
        if self.worker_options.network:
            pool.network = self.worker_options.network
        if self.worker_options.worker_harness_container_image:
            pool.workerHarnessContainerImage = (
                self.worker_options.worker_harness_container_image)
        else:
            # Default to using the worker harness container image for the current SDK
            # version.
            pool.workerHarnessContainerImage = (
                'dataflow.gcr.io/v1beta3/python:%s' %
                get_required_container_version())
        if self.worker_options.use_public_ips is not None:
            if self.worker_options.use_public_ips:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PUBLIC)
            else:
                pool.ipConfiguration = (
                    dataflow.WorkerPool.IpConfigurationValueValuesEnum.
                    WORKER_IP_PRIVATE)

        if self.standard_options.streaming:
            # Use separate data disk for streaming.
            disk = dataflow.Disk()
            if self.local:
                disk.diskType = 'local'
            # TODO(ccy): allow customization of disk.
            pool.dataDisks.append(disk)
        self.proto.workerPools.append(pool)

        sdk_pipeline_options = options.get_all_options()
        if sdk_pipeline_options:
            self.proto.sdkPipelineOptions = (
                dataflow.Environment.SdkPipelineOptionsValue())

            options_dict = {
                k: v
                for k, v in sdk_pipeline_options.iteritems() if v is not None
            }
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='options',
                                   value=to_json_value(options_dict)))

            dd = DisplayData.create_from_options(options)
            items = [item.get_dict() for item in dd.items]
            self.proto.sdkPipelineOptions.additionalProperties.append(
                dataflow.Environment.SdkPipelineOptionsValue.
                AdditionalProperty(key='display_data',
                                   value=to_json_value(items)))
Example #53
 def test_perkey_display_data(self):
     transform = beam.ApproximateQuantiles.PerKey(3, key=len, reverse=True)
     data = DisplayData.create_from(transform)
     expected_items = self._display_data_matcher(transform)
     hc.assert_that(data.items, hc.contains_inanyorder(*expected_items))
Example #54
 def test_display_data(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         dd = DisplayData.create_from(options)
         hc.assert_that(dd.items,
                        hc.contains_inanyorder(*case['display_data']))
Example #55
    def test_display_data(self):
        sink = PubSubSink('a_topic')
        dd = DisplayData.create_from(sink)
        expected_items = [DisplayDataItemMatcher('topic', 'a_topic')]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))
Example #56
    def test_display_data_no_subscription(self):
        source = PubSubSource('a_topic')
        dd = DisplayData.create_from(source)
        expected_items = [DisplayDataItemMatcher('topic', 'a_topic')]

        hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))