Example #1
 def test_no_group_by_key_directly_after_bigquery(self):
     remote_runner = DataflowRunner()
     p = Pipeline(remote_runner,
                  options=PipelineOptions([
                      '--dataflow_endpoint=ignored', '--job_name=test-job',
                      '--project=test-project',
                      '--staging_location=ignored',
                      '--temp_location=/dev/null', '--no_auth=True'
                  ]))
     rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
     with self.assertRaises(ValueError,
                            msg=('Coder for the GroupByKey operation '
                                 '"GroupByKey" is not a key-value coder: '
                                 'RowAsDictJsonCoder')):
         unused_invalid = rows | beam.GroupByKey()
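The test above checks that GroupByKey applied directly to BigQuery rows is rejected because RowAsDictJsonCoder is not a key-value coder. A minimal sketch of the usual remedy, assuming a hypothetical 'user_id' field as the key, is to map each row to a (key, value) pair before grouping:

import apache_beam as beam

# Illustrative sketch only; 'user_id' is a hypothetical column in the rows.
with beam.Pipeline() as p:
    rows = p | beam.Create([{'user_id': 1, 'score': 10},
                            {'user_id': 1, 'score': 20}])
    grouped = (
        rows
        | 'ToKV' >> beam.Map(lambda row: (row['user_id'], row))  # key-value pairs
        | beam.GroupByKey())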
Example #2
    def test_streaming_create_translation(self):
        remote_runner = DataflowRunner()
        self.default_properties.append("--streaming")
        with Pipeline(remote_runner,
                      PipelineOptions(self.default_properties)) as p:
            p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        job_dict = json.loads(str(remote_runner.job))
        self.assertEqual(len(job_dict[u'steps']), 3)

        self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
        self.assertEqual(
            job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
            '_starting_signal/')
        self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
        self.assertEqual(job_dict[u'steps'][2][u'kind'], u'ParallelDo')
Example #3
    def test_streaming_engine_flag_adds_windmill_experiments(self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--streaming')
        self.default_properties.append('--enable_streaming_engine')
        self.default_properties.append('--experiment=some_other_experiment')

        with Pipeline(remote_runner,
                      PipelineOptions(self.default_properties)) as p:
            p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

        experiments_for_job = (
            remote_runner.job.options.view_as(DebugOptions).experiments)
        self.assertIn('enable_streaming_engine', experiments_for_job)
        self.assertIn('enable_windmill_service', experiments_for_job)
        self.assertIn('some_other_experiment', experiments_for_job)
Example #4
 def test_environment_override_translation(self):
   self.default_properties.append('--experiments=beam_fn_api')
   self.default_properties.append('--worker_harness_container_image=FOO')
   remote_runner = DataflowRunner()
   p = Pipeline(remote_runner,
                options=PipelineOptions(self.default_properties))
   (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
    | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
    | ptransform.GroupByKey())
   p.run()
   self.assertEqual(
       list(remote_runner.proto_pipeline.components.environments.values()),
       [beam_runner_api_pb2.Environment(
           urn=common_urns.environments.DOCKER.urn,
           payload=beam_runner_api_pb2.DockerPayload(
               container_image='FOO').SerializeToString())])
Example #5
  def test_unsupported_fnapi_features(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=beam_fn_api')
    self.default_properties.append('--experiment=use_runner_v2')

    with self.assertRaisesRegex(RuntimeError, 'Unsupported merging'):
      with Pipeline(remote_runner,
                    options=PipelineOptions(self.default_properties)) as p:
        # pylint: disable=expression-not-assigned
        p | beam.Create([]) | beam.WindowInto(CustomMergingWindowFn())

    with self.assertRaisesRegex(RuntimeError, 'Unsupported window coder'):
      with Pipeline(remote_runner,
                    options=PipelineOptions(self.default_properties)) as p:
        # pylint: disable=expression-not-assigned
        p | beam.Create([]) | beam.WindowInto(CustomWindowTypeWindowFn())
Example #6
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #7
    def test_unsupported_combinefn_fail(self):
        class CombinerWithNonDefaultSetupTeardown(combiners.CountCombineFn):
            def setup(self, *args, **kwargs):
                pass

            def teardown(self, *args, **kwargs):
                pass

        runner = DataflowRunner()
        with self.assertRaisesRegex(
                ValueError, 'CombineFn.setup and CombineFn.'
                'teardown are not supported'):
            with beam.Pipeline(runner=runner,
                               options=PipelineOptions(
                                   self.default_properties)) as p:
                _ = (p | beam.Create([1])
                     | beam.CombineGlobally(
                         CombinerWithNonDefaultSetupTeardown()))
Example #8
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
        # this test.
        p.run(test_runner_api=False)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        self.assertUnhashableCountEqual(disp_data, expected_data)
Example #9
    def test_gbk_then_flatten_input_visitor(self):
        p = TestPipeline(runner=DataflowRunner(),
                         options=PipelineOptions(self.default_properties))
        none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
        none_int_pc = p | 'c2' >> beam.Create({None: 3})
        flat = (none_str_pc, none_int_pc) | beam.Flatten()
        _ = flat | beam.GroupByKey()

        # This may change if type inference changes, but we assert it here
        # to make sure the check below is not vacuous.
        self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

        p.visit(DataflowRunner.group_by_key_input_visitor())
        p.visit(DataflowRunner.flatten_input_visitor())

        # The dataflow runner requires gbk input to be tuples *and* flatten
        # inputs to be equal to their outputs. Assert both hold.
        self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
        self.assertEqual(flat.element_type, none_str_pc.element_type)
        self.assertEqual(flat.element_type, none_int_pc.element_type)
Example #10
  def _run_group_into_batches_and_get_step_properties(
      self, with_sharded_key, additional_properties):
    self.default_properties.append('--streaming')
    for property in additional_properties:
      self.default_properties.append(property)

    runner = DataflowRunner()
    with beam.Pipeline(runner=runner,
                       options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      input = p | beam.Create([('a', 1), ('a', 1), ('b', 3), ('b', 4)])
      if with_sharded_key:
        (
            input | beam.GroupIntoBatches.WithShardedKey(2)
            | beam.Map(lambda key_values: (key_values[0].key, key_values[1])))
        step_name = (
            u'WithShardedKey/GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)')
      else:
        input | beam.GroupIntoBatches(2)
        step_name = u'GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'

    return self._find_step(runner.job, step_name)['properties']
Example #11
    def test_resource_hints_translation(self, memory_hint):
        runner = DataflowRunner()
        self.default_properties.append('--resource_hint=accelerator=some_gpu')
        self.default_properties.append(f'--resource_hint={memory_hint}=20GB')
        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            (p
             | beam.Create([1])
             | 'MapWithHints' >> beam.Map(lambda x: x + 1).with_resource_hints(
                 min_ram='10GB',
                 accelerator=
                 'type:nvidia-tesla-k80;count:1;install-nvidia-drivers'))

        step = self._find_step(runner.job, 'MapWithHints')
        self.assertEqual(
            step['properties']['resource_hints'],
            {
                'beam:resources:min_ram_bytes:v1': '20000000000',
                'beam:resources:accelerator:v1': \
                    'type%3Anvidia-tesla-k80%3Bcount%3A1%3Binstall-nvidia-drivers'
            })
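Two details of the expected step properties above are worth noting: the min_ram_bytes value that lands in the step comes from the 20GB pipeline-level hint rather than the transform's 10GB hint, and the accelerator hint is stored percent-encoded. The encoded string can be reproduced with a quick illustrative check (not part of the test itself):

from urllib.parse import quote

hint = 'type:nvidia-tesla-k80;count:1;install-nvidia-drivers'
# Prints: type%3Anvidia-tesla-k80%3Bcount%3A1%3Binstall-nvidia-drivers
print(quote(hint))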
Example #12
  def _test_pack_combiners(self, pipeline_options, expect_packed):
    runner = DataflowRunner()

    with beam.Pipeline(runner=runner, options=pipeline_options) as p:
      data = p | beam.Create([10, 20, 30])
      _ = data | 'PackableMin' >> beam.CombineGlobally(min)
      _ = data | 'PackableMax' >> beam.CombineGlobally(max)

    unpacked_minimum_step_name = 'PackableMin/CombinePerKey/Combine'
    unpacked_maximum_step_name = 'PackableMax/CombinePerKey/Combine'
    packed_step_name = (
        'Packed[PackableMin/CombinePerKey, PackableMax/CombinePerKey]/Pack/'
        'CombinePerKey(SingleInputTupleCombineFn)/Combine')
    job_dict = json.loads(str(runner.job))
    step_names = set(s[u'properties'][u'user_name'] for s in job_dict[u'steps'])
    if expect_packed:
      self.assertNotIn(unpacked_minimum_step_name, step_names)
      self.assertNotIn(unpacked_maximum_step_name, step_names)
      self.assertIn(packed_step_name, step_names)
    else:
      self.assertIn(unpacked_minimum_step_name, step_names)
      self.assertIn(unpacked_maximum_step_name, step_names)
      self.assertNotIn(packed_step_name, step_names)
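The helper above is parameterized by pipeline options and an expectation; its callers are not shown. A sketch of how it might be invoked, assuming the combiner-packing optimization is switched on via the pre_optimize experiment (that flag value is an assumption, not confirmed by the snippet):

  def test_pack_combiners_with_pre_optimize(self):
    # Assumption: the 'pre_optimize=all' experiment enables combiner packing;
    # nothing in the snippet above confirms which options its callers use.
    options = PipelineOptions(
        self.default_properties + ['--experiments=pre_optimize=all'])
    self._test_pack_combiners(options, expect_packed=True)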
Example #13
    def test_gbk_translation(self):
        runner = DataflowRunner()
        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            p | beam.Create([(1, 2)]) | beam.GroupByKey()

        expected_output_info = [{
            "encoding": {
                "@type": "kind:windowed_value",
                "component_encodings": [{
                    "@type": "kind:pair",
                    "component_encodings": [{
                        "@type": "kind:varint"
                    },
                    {
                        "@type": "kind:stream",
                        "component_encodings": [{
                            "@type": "kind:varint"
                        }],
                        "is_stream_like": True
                    }],
                    "is_pair_like": True
                }, {
                    "@type": "kind:global_window"
                }],
                "is_wrapper": True
            },
            "output_name": "out",
            "user_name": "GroupByKey.out"
        }]  # yapf: disable

        gbk_step = self._find_step(runner.job, u'GroupByKey')
        self.assertEqual(gbk_step[u'kind'], u'GroupByKey')
        self.assertEqual(gbk_step[u'properties']['output_info'],
                         expected_output_info)
Example #14
 def test_get_default_gcp_region_ignores_error(self, patched_environ,
                                               patched_processes):
     runner = DataflowRunner()
     result = runner.get_default_gcp_region()
     self.assertIsNone(result)
Example #15
 def test_get_default_gcp_region_from_gcloud(self, patched_environ,
                                             patched_processes):
     runner = DataflowRunner()
     result = runner.get_default_gcp_region()
     self.assertEqual(result, 'some-region2')
Example #16
 def test_get_default_gcp_region_no_default_returns_none(
         self, patched_environ, patched_processes):
     runner = DataflowRunner()
     result = runner.get_default_gcp_region()
     self.assertIsNone(result)
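Examples #14 to #16 receive patched_environ and patched_processes from decorators that are not shown. A minimal sketch of how the patching might be wired, assuming mock.patch targets for os.environ.get and the gcloud subprocess wrapper (both targets are assumptions):

from unittest import mock

# Assumed patch targets; the real decorators are not shown above. With
# mock.patch, the bottom-most decorator supplies the first mock argument,
# so patched_environ maps to the os.environ.get patch here.
@mock.patch(
    'apache_beam.utils.processes.check_output',  # assumed gcloud wrapper
    return_value=b'some-region2\n')
@mock.patch('os.environ.get', return_value=None)
def test_get_default_gcp_region_from_gcloud(
    self, patched_environ, patched_processes):
  runner = DataflowRunner()
  result = runner.get_default_gcp_region()
  self.assertEqual(result, 'some-region2')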
Example #17
    windowed_avg = (
        windowed_data
        | "avg1" >> beam.CombinePerKey(beam.combiners.MeanCombineFn()))

    class PrintWindowResults(beam.DoFn):
        def process(self, element, window=beam.DoFn.WindowParam):
            new_element = element
            yield new_element

    (windowed_sum
     | "sum4" >> beam.ParDo(PrintWindowResults())
     | "sum5" >> beam.Map(lambda st: '{{"id": {}, "total_steps": {}}}'.format(st[0], st[1]))
     | "sum6" >> beam.Map(lambda z: bytes(z, "utf-8"))
     | "sum7" >> beam.io.WriteToPubSub(topic="projects/data228/topics/data228-hw8-out"))

    (windowed_avg
     | "avg4" >> beam.ParDo(PrintWindowResults())
     | "avg5" >> beam.Map(lambda av: '{{"id": {}, "average_steps": {}}}'.format(av[0], av[1]))
     | "avg6" >> beam.Map(lambda po: bytes(po, "utf-8"))
     | "avg7" >> beam.io.WriteToPubSub(topic="projects/data228/topics/data228-hw8-out"))

    DataflowRunner().run_pipeline(pipeline, options=options)
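The fragment above relies on pipeline, options, windowed_data and windowed_sum defined elsewhere. A minimal sketch of what that upstream section might look like, assuming a streaming Pub/Sub source, JSON records with id and steps fields, and 60-second fixed windows (all of these specifics are assumptions):

import json

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import FixedWindows

# Assumed upstream wiring: the input topic, record shape {"id": ..., "steps": ...}
# and the 60s window size are illustrative, not taken from the fragment above.
options = PipelineOptions(streaming=True)
pipeline = beam.Pipeline(options=options)

windowed_data = (
    pipeline
    | "read" >> beam.io.ReadFromPubSub(
        topic="projects/data228/topics/data228-hw8-in")  # hypothetical topic
    | "decode" >> beam.Map(lambda msg: json.loads(msg.decode("utf-8")))
    | "kv" >> beam.Map(lambda rec: (rec["id"], rec["steps"]))
    | "window" >> beam.WindowInto(FixedWindows(60)))

windowed_sum = windowed_data | "sum1" >> beam.CombinePerKey(sum)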

Example #18
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        # TODO: Should not subclass ParDo. Switch to PTransform as soon as
        # composite transforms support display data.
        class SpecialParDo(beam.ParDo):
            def __init__(self, fn, now):
                super(SpecialParDo, self).__init__(fn)
                self.fn = fn
                self.now = now

            def display_data(self):
                return {
                    'asubcomponent': self.fn,
                    'a_class': SpecialParDo,
                    'a_time': self.now
                }

        class SpecialDoFn(beam.DoFn):
            def display_data(self):
                return {'dofn_value': 42}

            def process(self):
                pass

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        p.run()
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        disp_data = sorted(disp_data, key=lambda x: x['namespace'] + x['key'])
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        expected_data = sorted(expected_data,
                               key=lambda x: x['namespace'] + x['key'])
        self.assertEqual(len(disp_data), 3)
        self.assertEqual(disp_data, expected_data)