def test_group_by_key_input_visitor_for_non_gbk_transforms(self):
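   """Non-GBK transforms visited by the GBK input visitor keep their element type."""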
   p = TestPipeline()
   pcoll = PCollection(p)
   for transform in [beam.Flatten(), beam.Map(lambda x: x)]:
     pcoll.element_type = typehints.Any
     DataflowRunner.group_by_key_input_visitor().visit_transform(
         AppliedPTransform(None, transform, "label", [pcoll]))
     self.assertEqual(pcoll.element_type, typehints.Any)
 def test_serialize_windowing_strategy(self):
   # This just tests the basic path; more complete tests
   # are in window_test.py.
   strategy = Windowing(window.FixedWindows(10))
   self.assertEqual(
       strategy,
       DataflowRunner.deserialize_windowing_strategy(
           DataflowRunner.serialize_windowing_strategy(strategy)))
  def test_remote_runner_translation(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
     | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
     | ptransform.GroupByKey())
    remote_runner.job = apiclient.Job(p.options)
    super(DataflowRunner, remote_runner).run(p)
 def test_group_by_key_input_visitor_with_invalid_inputs(self):
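   """The GBK input visitor rejects inputs whose element type is not KV-compatible."""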
   p = TestPipeline()
   pcoll1 = PCollection(p)
   pcoll2 = PCollection(p)
   for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
     pcoll1.element_type = typehints.TupleSequenceConstraint
     pcoll2.element_type = typehints.Set
     err_msg = "Input to GroupByKey must be of Tuple or Any type"
     for pcoll in [pcoll1, pcoll2]:
       with self.assertRaisesRegexp(ValueError, err_msg):
         DataflowRunner.group_by_key_input_visitor().visit_transform(
             AppliedPTransform(None, transform, "label", [pcoll]))
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Provide display data for the wrapped DoFn, this class, and a timestamp.
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    remote_runner.job = apiclient.Job(p.options)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[0]
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__+ '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
 def test_group_by_key_input_visitor_with_invalid_inputs(self):
   p = TestPipeline()
   pcoll1 = PCollection(p)
   pcoll2 = PCollection(p)
   for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
     pcoll1.element_type = str
     pcoll2.element_type = typehints.Set
     err_msg = (
         r"Input to 'label' must be compatible with KV\[Any, Any\]. "
         "Found .*")
     for pcoll in [pcoll1, pcoll2]:
       with self.assertRaisesRegexp(ValueError, err_msg):
         DataflowRunner.group_by_key_input_visitor().visit_transform(
             AppliedPTransform(None, transform, "label", [pcoll]))
 def test_group_by_key_input_visitor_with_valid_inputs(self):
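   """The GBK input visitor rewrites compatible input types to KV[Any, Any]."""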
   p = TestPipeline()
   pcoll1 = PCollection(p)
   pcoll2 = PCollection(p)
   pcoll3 = PCollection(p)
   for transform in [_GroupByKeyOnly(), beam.GroupByKey()]:
     pcoll1.element_type = None
     pcoll2.element_type = typehints.Any
     pcoll3.element_type = typehints.KV[typehints.Any, typehints.Any]
     for pcoll in [pcoll1, pcoll2, pcoll3]:
       DataflowRunner.group_by_key_input_visitor().visit_transform(
           AppliedPTransform(None, transform, "label", [pcoll]))
       self.assertEqual(pcoll.element_type,
                        typehints.KV[typehints.Any, typehints.Any])
  def test_side_input_visitor(self):
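    """The side input visitor converts side inputs to Dataflow multimap side inputs."""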
    p = TestPipeline()
    pc = p | beam.Create([])

    transform = beam.Map(
        lambda x, y, z: (x, y, z),
        beam.pvalue.AsSingleton(pc),
        beam.pvalue.AsMultiMap(pc))
    applied_transform = AppliedPTransform(None, transform, "label", [pc])
    DataflowRunner.side_input_visitor().visit_transform(applied_transform)
    self.assertEqual(2, len(applied_transform.side_inputs))
    for side_input in applied_transform.side_inputs:
      self.assertEqual(
          dataflow_runner._DataflowSideInput.DATAFLOW_MULTIMAP_URN,
          side_input._side_input_data().access_pattern)
  def _test_flatten_input_visitor(self, input_type, output_type, num_inputs):
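    """Helper: after visiting, each Flatten input takes on the output's element type."""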
    p = TestPipeline()
    inputs = []
    for _ in range(num_inputs):
      input_pcoll = PCollection(p)
      input_pcoll.element_type = input_type
      inputs.append(input_pcoll)
    output_pcoll = PCollection(p)
    output_pcoll.element_type = output_type

    flatten = AppliedPTransform(None, beam.Flatten(), "label", inputs)
    flatten.add_output(output_pcoll, None)
    DataflowRunner.flatten_input_visitor().visit_transform(flatten)
    for input_pcoll in inputs:
      self.assertEqual(input_pcoll.element_type, output_type)
  def test_streaming_create_translation(self):
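    """Streaming Create translates to a Pub/Sub ParallelRead followed by a ParallelDo."""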
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
  def test_gbk_then_flatten_input_visitor(self):
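    """A Flatten feeding a GroupByKey ends up with tuple-typed inputs and outputs."""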
    p = TestPipeline(
        runner=DataflowRunner(),
        options=PipelineOptions(self.default_properties))
    none_str_pc = p | 'c1' >> beam.Create({None: 'a'})
    none_int_pc = p | 'c2' >> beam.Create({None: 3})
    flat = (none_str_pc, none_int_pc) | beam.Flatten()
    _ = flat | beam.GroupByKey()

    # This may change if type inference changes, but we assert it here
    # to make sure the check below is not vacuous.
    self.assertNotIsInstance(flat.element_type, typehints.TupleConstraint)

    p.visit(DataflowRunner.group_by_key_input_visitor())
    p.visit(DataflowRunner.flatten_input_visitor())

    # The dataflow runner requires gbk input to be tuples *and* flatten
    # inputs to be equal to their outputs. Assert both hold.
    self.assertIsInstance(flat.element_type, typehints.TupleConstraint)
    self.assertEqual(flat.element_type, none_str_pc.element_type)
    self.assertEqual(flat.element_type, none_int_pc.element_type)
# Example #12
    def test_dataflow_worker_jar_flag_non_fnapi_noop(self):
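        """Without beam_fn_api, --dataflow_worker_jar does not add the staged-jar experiment."""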
        remote_runner = DataflowRunner()
        self.default_properties.append('--experiment=some_other_experiment')
        self.default_properties.append('--dataflow_worker_jar=test.jar')

        with Pipeline(remote_runner,
                      PipelineOptions(self.default_properties)) as p:
            p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

        experiments_for_job = (
            remote_runner.job.options.view_as(DebugOptions).experiments)
        self.assertIn('some_other_experiment', experiments_for_job)
        self.assertNotIn('use_staged_dataflow_worker_jar', experiments_for_job)
# Example #13
  def test_dataflow_worker_jar_flag_adds_use_staged_worker_jar_experiment(self):
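    """With beam_fn_api, --dataflow_worker_jar adds use_staged_dataflow_worker_jar."""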
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=beam_fn_api')
    self.default_properties.append('--dataflow_worker_jar=test.jar')

    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    p.run()

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('beam_fn_api', experiments_for_job)
    self.assertIn('use_staged_dataflow_worker_jar', experiments_for_job)
# Example #14
 def test_bigquery_read_fn_api_fail(self):
   remote_runner = DataflowRunner()
   for flag in ['beam_fn_api', 'use_unified_worker', 'use_runner_v2']:
     self.default_properties.append("--experiments=%s" % flag)
     with self.assertRaisesRegex(
         ValueError,
         'The Read.BigQuerySource.*is not supported.*'
         'apache_beam.io.gcp.bigquery.ReadFromBigQuery.*'):
       with Pipeline(remote_runner,
                     PipelineOptions(self.default_properties)) as p:
         _ = p | beam.io.Read(
             beam.io.BigQuerySource(
                 'some.table', use_dataflow_native_source=True))
# Example #15
 def test_no_group_by_key_directly_after_bigquery(self):
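   """A GroupByKey directly after a native BigQuery read fails: the coder is not KV."""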
   remote_runner = DataflowRunner()
   with self.assertRaises(ValueError,
                          msg=('Coder for the GroupByKey operation'
                               '"GroupByKey" is not a key-value coder: '
                               'RowAsDictJsonCoder')):
     with beam.Pipeline(runner=remote_runner,
                        options=PipelineOptions(self.default_properties)) as p:
       # pylint: disable=expression-not-assigned
       p | beam.io.Read(
           beam.io.BigQuerySource(
               'dataset.faketable',
               use_dataflow_native_source=True)) | beam.GroupByKey()
# Example #16
    def test_read_pubsub_translation(self):
        runner = DataflowRunner()

        self.default_properties.append("--streaming")

        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            p | beam.io.ReadFromPubSub(topic='projects/project/topics/topic')

        self.expect_correct_override(runner.job, u'ReadFromPubSub/Read',
                                     u'ParallelRead')
# Example #17
  def test_combine_values_translation(self):
    runner = DataflowRunner()

    with beam.Pipeline(runner=runner,
                       options=PipelineOptions(self.default_properties)) as p:
      (  # pylint: disable=expression-not-assigned
          p
          | beam.Create([('a', [1, 2]), ('b', [3, 4])])
          | beam.CombineValues(lambda v, _: sum(v)))

    job_dict = json.loads(str(runner.job))
    self.assertIn(
        u'CombineValues', set(step[u'kind'] for step in job_dict[u'steps']))
# Example #18
    def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(
            self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--experiment=use_avro')

        with Pipeline(remote_runner,
                      PipelineOptions(self.default_properties)) as p:
            p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

        debug_options = remote_runner.job.options.view_as(DebugOptions)

        self.assertFalse(debug_options.lookup_experiment(
            'use_fastavro', False))
# Example #19
    def test_read_bigquery_translation(self):
        runner = DataflowRunner()

        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            p | beam.io.Read(
                beam.io.BigQuerySource('some.table',
                                       coder=BytesCoder(),
                                       use_dataflow_native_source=True))

        self.expect_correct_override(runner.job, u'Read', u'ParallelRead')
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Provide display data for the wrapped DoFn, this class, and a timestamp.
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    p.run()
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[1]
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__+ '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
# Example #21
    def test_write_bigquery_failed_translation(self):
        """Tests that WriteToBigQuery cannot have any consumers if replaced."""
        runner = DataflowRunner()

        self.default_properties.append('--experiments=use_legacy_bq_sink')
        with self.assertRaises(Exception):
            with beam.Pipeline(runner=runner,
                               options=PipelineOptions(
                                   self.default_properties)) as p:
                # pylint: disable=expression-not-assigned
                out = p | beam.Create(
                    [1]) | beam.io.WriteToBigQuery('some.table')
                out['destination_file_pairs'] | 'MyTransform' >> beam.Map(
                    lambda _: _)
  def test_streaming_engine_flag_adds_windmill_experiments(self):
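    """--enable_streaming_engine adds the Streaming Engine and Windmill experiments."""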
    remote_runner = DataflowRunner()
    self.default_properties.append('--streaming')
    self.default_properties.append('--enable_streaming_engine')
    self.default_properties.append('--experiment=some_other_experiment')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('enable_streaming_engine', experiments_for_job)
    self.assertIn('enable_windmill_service', experiments_for_job)
    self.assertIn('some_other_experiment', experiments_for_job)
# Example #23
  def test_write_bigquery_translation(self):
    runner = DataflowRunner()

    self.default_properties.append('--experiments=use_legacy_bq_sink')
    with beam.Pipeline(runner=runner,
                       options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      p | beam.Create([1]) | beam.io.WriteToBigQuery('some.table')

    job_dict = json.loads(str(runner.job))

    expected_step = {
        "kind": "ParallelWrite",
        "name": "s2",
        "properties": {
            "create_disposition": "CREATE_IF_NEEDED",
            "dataset": "some",
            "display_data": [],
            "encoding": {
                "@type": "kind:windowed_value",
                "component_encodings": [{
                    "component_encodings": [],
                    "pipeline_proto_coder_id": "ref_Coder_RowAsDictJsonCoder_4"
                }, {
                    "@type": "kind:global_window"
                }],
                "is_wrapper": True
            },
            "format": "bigquery",
            "parallel_input": {
                "@type": "OutputReference",
                "output_name": "out",
                "step_name": "s1"
            },
            "table": "table",
            "user_name": "WriteToBigQuery/Write/NativeWrite",
            "write_disposition": "WRITE_APPEND"
        }
    }
    job_dict = json.loads(str(runner.job))
    write_step = [
        s for s in job_dict[u'steps']
        if s[u'properties'][u'user_name'].startswith('WriteToBigQuery')
    ][0]

    # Delete the @type field because in this case it is a hash which may change
    # depending on the pickling version.
    step_encoding = write_step[u'properties'][u'encoding']
    del step_encoding[u'component_encodings'][0][u'@type']
    self.assertEqual(expected_step, write_step)
# Example #24
    def test_streaming_create_translation(self):
        remote_runner = DataflowRunner()
        self.default_properties.append("--streaming")
        p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
        p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        p.run()
        job_dict = json.loads(str(remote_runner.job))
        self.assertEqual(len(job_dict[u'steps']), 2)

        self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
        self.assertEqual(
            job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
            '_starting_signal/')
        self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
# Example #25
 def test_no_group_by_key_directly_after_bigquery(self):
     remote_runner = DataflowRunner()
     p = Pipeline(remote_runner,
                  options=PipelineOptions([
                      '--dataflow_endpoint=ignored', '--job_name=test-job',
                      '--project=test-project',
                      '--staging_location=ignored',
                      '--temp_location=/dev/null', '--no_auth'
                  ]))
     rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
     with self.assertRaises(ValueError,
                            msg=('Coder for the GroupByKey operation'
                                 '"GroupByKey" is not a key-value coder: '
                                 'RowAsDictJsonCoder')):
         unused_invalid = rows | beam.GroupByKey()
  def test_unsupported_fnapi_features(self):
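    """With runner v2 experiments, custom merging WindowFns and window coders are rejected."""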
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=beam_fn_api')
    self.default_properties.append('--experiment=use_runner_v2')

    with self.assertRaisesRegex(RuntimeError, 'Unsupported merging'):
      with Pipeline(remote_runner,
                    options=PipelineOptions(self.default_properties)) as p:
        # pylint: disable=expression-not-assigned
        p | beam.Create([]) | beam.WindowInto(CustomMergingWindowFn())

    with self.assertRaisesRegex(RuntimeError, 'Unsupported window coder'):
      with Pipeline(remote_runner,
                    options=PipelineOptions(self.default_properties)) as p:
        # pylint: disable=expression-not-assigned
        p | beam.Create([]) | beam.WindowInto(CustomWindowTypeWindowFn())
# Example #27
 def test_environment_override_translation(self):
   self.default_properties.append('--experiments=beam_fn_api')
   self.default_properties.append('--worker_harness_container_image=FOO')
   remote_runner = DataflowRunner()
   p = Pipeline(remote_runner,
                options=PipelineOptions(self.default_properties))
   (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
    | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
    | ptransform.GroupByKey())
   p.run()
   self.assertEqual(
       list(remote_runner.proto_pipeline.components.environments.values()),
       [beam_runner_api_pb2.Environment(
           urn=common_urns.environments.DOCKER.urn,
           payload=beam_runner_api_pb2.DockerPayload(
               container_image='FOO').SerializeToString())])
# Example #28
    def test_unsupported_combinefn_fail(self):
        class CombinerWithNonDefaultSetupTeardown(combiners.CountCombineFn):
            def setup(self, *args, **kwargs):
                pass

            def teardown(self, *args, **kwargs):
                pass

        runner = DataflowRunner()
        with self.assertRaisesRegex(
                ValueError, 'CombineFn.setup and CombineFn.'
                'teardown are not supported'):
            with beam.Pipeline(runner=runner,
                               options=PipelineOptions(
                                   self.default_properties)) as p:
                _ = (p | beam.Create([1])
                     | beam.CombineGlobally(
                         CombinerWithNonDefaultSetupTeardown()))
# Example #29
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        # TODO(https://github.com/apache/beam/issues/18012) Enable runner API on
        # this test.
        p.run(test_runner_api=False)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type':
            'TIMESTAMP',
            'namespace':
            nspace + 'SpecialParDo',
            'value':
            DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key':
            'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        self.assertUnhashableCountEqual(disp_data, expected_data)
# Example #30
  def _run_group_into_batches_and_get_step_properties(
      self, with_sharded_key, additional_properties):
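    """Helper: run a streaming GroupIntoBatches pipeline and return its step properties."""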
    self.default_properties.append('--streaming')
    for property in additional_properties:
      self.default_properties.append(property)

    runner = DataflowRunner()
    with beam.Pipeline(runner=runner,
                       options=PipelineOptions(self.default_properties)) as p:
      # pylint: disable=expression-not-assigned
      input = p | beam.Create([('a', 1), ('a', 1), ('b', 3), ('b', 4)])
      if with_sharded_key:
        (
            input | beam.GroupIntoBatches.WithShardedKey(2)
            | beam.Map(lambda key_values: (key_values[0].key, key_values[1])))
        step_name = (
            u'WithShardedKey/GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)')
      else:
        input | beam.GroupIntoBatches(2)
        step_name = u'GroupIntoBatches/ParDo(_GroupIntoBatchesDoFn)'

    return self._find_step(runner.job, step_name)['properties']
# Example #31
  def _test_pack_combiners(self, pipeline_options, expect_packed):
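    """Helper: check whether sibling CombineGlobally transforms are packed into one step."""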
    runner = DataflowRunner()

    with beam.Pipeline(runner=runner, options=pipeline_options) as p:
      data = p | beam.Create([10, 20, 30])
      _ = data | 'PackableMin' >> beam.CombineGlobally(min)
      _ = data | 'PackableMax' >> beam.CombineGlobally(max)

    unpacked_minimum_step_name = 'PackableMin/CombinePerKey/Combine'
    unpacked_maximum_step_name = 'PackableMax/CombinePerKey/Combine'
    packed_step_name = (
        'Packed[PackableMin/CombinePerKey, PackableMax/CombinePerKey]/Pack/'
        'CombinePerKey(SingleInputTupleCombineFn)/Combine')
    job_dict = json.loads(str(runner.job))
    step_names = set(s[u'properties'][u'user_name'] for s in job_dict[u'steps'])
    if expect_packed:
      self.assertNotIn(unpacked_minimum_step_name, step_names)
      self.assertNotIn(unpacked_maximum_step_name, step_names)
      self.assertIn(packed_step_name, step_names)
    else:
      self.assertIn(unpacked_minimum_step_name, step_names)
      self.assertIn(unpacked_maximum_step_name, step_names)
      self.assertNotIn(packed_step_name, step_names)
# Example #32
    def test_resource_hints_translation(self, memory_hint):
        runner = DataflowRunner()
        self.default_properties.append('--resource_hint=accelerator=some_gpu')
        self.default_properties.append(f'--resource_hint={memory_hint}=20GB')
        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            (p
             | beam.Create([1])
             | 'MapWithHints' >> beam.Map(lambda x: x + 1).with_resource_hints(
                 min_ram='10GB',
                 accelerator=
                 'type:nvidia-tesla-k80;count:1;install-nvidia-drivers'))

        step = self._find_step(runner.job, 'MapWithHints')
        self.assertEqual(
            step['properties']['resource_hints'],
            {
                'beam:resources:min_ram_bytes:v1': '20000000000',
                'beam:resources:accelerator:v1': \
                    'type%3Anvidia-tesla-k80%3Bcount%3A1%3Binstall-nvidia-drivers'
            })
# Example #33
    def test_gbk_translation(self):
        runner = DataflowRunner()
        with beam.Pipeline(runner=runner,
                           options=PipelineOptions(
                               self.default_properties)) as p:
            # pylint: disable=expression-not-assigned
            p | beam.Create([(1, 2)]) | beam.GroupByKey()

        expected_output_info = [{
            "encoding": {
                "@type": "kind:windowed_value",
                "component_encodings": [{
                    "@type": "kind:pair",
                    "component_encodings": [{
                        "@type": "kind:varint"
                    },
                    {
                        "@type": "kind:stream",
                        "component_encodings": [{
                            "@type": "kind:varint"
                        }],
                        "is_stream_like": True
                    }],
                    "is_pair_like": True
                }, {
                    "@type": "kind:global_window"
                }],
                "is_wrapper": True
            },
            "output_name": "out",
            "user_name": "GroupByKey.out"
        }]  # yapf: disable

        gbk_step = self._find_step(runner.job, u'GroupByKey')
        self.assertEqual(gbk_step[u'kind'], u'GroupByKey')
        self.assertEqual(gbk_step[u'properties']['output_info'],
                         expected_output_info)
# Example #34
    def run_ParDo(self, transform_node):
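        """Translates a ParDo node into a WorkerDoFn operation, breaking fusion
        when a side input depends on output produced in the current map task."""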
        transform = transform_node.transform
        output = transform_node.outputs[None]
        element_coder = self._get_coder(output)
        map_task_index, producer_index, output_index = self.outputs[
            transform_node.inputs[0]]

        # If any of this ParDo's side inputs depend on outputs from this map_task,
        # we can't continue growing this map task.
        def is_reachable(leaf, root):
            if leaf == root:
                return True
            else:
                return any(
                    is_reachable(x, root) for x in self.dependencies[leaf])

        if any(
                is_reachable(self.outputs[side_input.pvalue][0],
                             map_task_index)
                for side_input in transform_node.side_inputs):
            # Start a new map task.
            input_element_coder = self._get_coder(transform_node.inputs[0])

            output_buffer = OutputBuffer(input_element_coder)

            fusion_break_write = operation_specs.WorkerInMemoryWrite(
                output_buffer=output_buffer,
                write_windowed_values=True,
                input=(producer_index, output_index),
                output_coders=[input_element_coder])
            self.map_tasks[map_task_index].append(
                (transform_node.full_label + '/Write', fusion_break_write))

            original_map_task_index = map_task_index
            map_task_index, producer_index, output_index = len(
                self.map_tasks), 0, 0

            fusion_break_read = operation_specs.WorkerRead(
                output_buffer.source_bundle(),
                output_coders=[input_element_coder])
            self.map_tasks.append([(transform_node.full_label + '/Read',
                                    fusion_break_read)])

            self.dependencies[map_task_index].add(original_map_task_index)

        def create_side_read(side_input):
            label = self.side_input_labels[side_input]
            output_buffer = self.run_side_write(
                side_input.pvalue,
                '%s/%s' % (transform_node.full_label, label))
            return operation_specs.WorkerSideInputSource(
                output_buffer.source(), label)

        do_op = operation_specs.WorkerDoFn(  #
            serialized_fn=pickler.dumps(
                DataflowRunner._pardo_fn_data(
                    transform_node,
                    lambda side_input: self.side_input_labels[side_input])),
            output_tags=[PropertyNames.OUT] + [
                '%s_%s' % (PropertyNames.OUT, tag)
                for tag in transform.output_tags
            ],
            # Same assumption that DataflowRunner has about coders being compatible
            # across outputs.
            output_coders=[element_coder] * (len(transform.output_tags) + 1),
            input=(producer_index, output_index),
            side_inputs=[
                create_side_read(side_input)
                for side_input in transform_node.side_inputs
            ])

        producer_index = len(self.map_tasks[map_task_index])
        self.outputs[transform_node.outputs[None]] = (map_task_index,
                                                      producer_index, 0)
        for ix, tag in enumerate(transform.output_tags):
            self.outputs[transform_node.
                         outputs[tag]] = map_task_index, producer_index, ix + 1
        self.map_tasks[map_task_index].append(
            (transform_node.full_label, do_op))

        for side_input in transform_node.side_inputs:
            self.dependencies[map_task_index].add(
                self.outputs[side_input.pvalue][0])
  def run_ParDo(self, transform_node):
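    """Translates a ParDo node into a WorkerDoFn operation, breaking fusion
    when a side input depends on output produced in the current map task."""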
    transform = transform_node.transform
    output = transform_node.outputs[None]
    element_coder = self._get_coder(output)
    map_task_index, producer_index, output_index = self.outputs[
        transform_node.inputs[0]]

    # If any of this ParDo's side inputs depend on outputs from this map_task,
    # we can't continue growing this map task.
    def is_reachable(leaf, root):
      if leaf == root:
        return True
      else:
        return any(is_reachable(x, root) for x in self.dependencies[leaf])

    if any(is_reachable(self.outputs[side_input.pvalue][0], map_task_index)
           for side_input in transform_node.side_inputs):
      # Start a new map task.
      input_element_coder = self._get_coder(transform_node.inputs[0])

      output_buffer = OutputBuffer(input_element_coder)

      fusion_break_write = operation_specs.WorkerInMemoryWrite(
          output_buffer=output_buffer,
          write_windowed_values=True,
          input=(producer_index, output_index),
          output_coders=[input_element_coder])
      self.map_tasks[map_task_index].append(
          (transform_node.full_label + '/Write', fusion_break_write))

      original_map_task_index = map_task_index
      map_task_index, producer_index, output_index = len(self.map_tasks), 0, 0

      fusion_break_read = operation_specs.WorkerRead(
          output_buffer.source_bundle(),
          output_coders=[input_element_coder])
      self.map_tasks.append(
          [(transform_node.full_label + '/Read', fusion_break_read)])

      self.dependencies[map_task_index].add(original_map_task_index)

    def create_side_read(side_input):
      label = self.side_input_labels[side_input]
      output_buffer = self.run_side_write(
          side_input.pvalue, '%s/%s' % (transform_node.full_label, label))
      return operation_specs.WorkerSideInputSource(
          output_buffer.source(), label)

    do_op = operation_specs.WorkerDoFn(  #
        serialized_fn=pickler.dumps(DataflowRunner._pardo_fn_data(
            transform_node,
            lambda side_input: self.side_input_labels[side_input])),
        output_tags=[PropertyNames.OUT] + ['%s_%s' % (PropertyNames.OUT, tag)
                                           for tag in transform.output_tags
                                          ],
        # Same assumption that DataflowRunner has about coders being compatible
        # across outputs.
        output_coders=[element_coder] * (len(transform.output_tags) + 1),
        input=(producer_index, output_index),
        side_inputs=[create_side_read(side_input)
                     for side_input in transform_node.side_inputs])

    producer_index = len(self.map_tasks[map_task_index])
    self.outputs[transform_node.outputs[None]] = (
        map_task_index, producer_index, 0)
    for ix, tag in enumerate(transform.output_tags):
      self.outputs[transform_node.outputs[
          tag]] = map_task_index, producer_index, ix + 1
    self.map_tasks[map_task_index].append((transform_node.full_label, do_op))

    for side_input in transform_node.side_inputs:
      self.dependencies[map_task_index].add(self.outputs[side_input.pvalue][0])
# Example #36
 def test_get_default_gcp_region_ignores_error(self, patched_environ,
                                               patched_processes):
     runner = DataflowRunner()
     result = runner.get_default_gcp_region()
     self.assertIsNone(result)
# Example #37
 def test_get_default_gcp_region_from_gcloud(self, patched_environ,
                                             patched_processes):
     runner = DataflowRunner()
     result = runner.get_default_gcp_region()
     self.assertEqual(result, 'some-region2')
# Example #38
 def test_get_default_gcp_region_no_default_returns_none(
         self, patched_environ, patched_processes):
     runner = DataflowRunner()
     result = runner.get_default_gcp_region()
     self.assertIsNone(result)
# Example #39
 def _get_coder(self, pvalue, windowed=True):
     # TODO(robertwb): This should be an attribute of the pvalue itself.
     return DataflowRunner._get_coder(
         pvalue.element_type or typehints.Any,
         pvalue.windowing.windowfn.get_window_coder() if windowed else None)
 def _get_coder(self, pvalue, windowed=True):
   # TODO(robertwb): This should be an attribute of the pvalue itself.
   return DataflowRunner._get_coder(
       pvalue.element_type or typehints.Any,
       pvalue.windowing.windowfn.get_window_coder() if windowed else None)
    # Note: `windowed_data`, `windowed_sum`, `pipeline`, and `options` are
    # assumed to be defined earlier in this pipeline script.
    windowed_avg = (
        windowed_data
        | "avg1" >> beam.CombinePerKey(beam.combiners.MeanCombineFn()))

    class PrintWindowResults(beam.DoFn):
        # Currently forwards elements unchanged; the window is available via
        # beam.DoFn.WindowParam if per-window output is ever needed.
        def process(self, element, window=beam.DoFn.WindowParam):
            yield element

    (windowed_sum
     | "sum4" >> beam.ParDo(PrintWindowResults())
     | "sum5" >> beam.Map(
         lambda st: '{{"id": {}, "total_steps": {}}}'.format(st[0], st[1]))
     | "sum6" >> beam.Map(lambda z: bytes(z, "utf-8"))
     | "sum7" >> beam.io.WriteToPubSub(
         topic="projects/data228/topics/data228-hw8-out"))

    (windowed_avg
     | "avg4" >> beam.ParDo(PrintWindowResults())
     | "avg5" >> beam.Map(
         lambda av: '{{"id": {}, "average_steps": {}}}'.format(av[0], av[1]))
     | "avg6" >> beam.Map(lambda po: bytes(po, "utf-8"))
     | "avg7" >> beam.io.WriteToPubSub(
         topic="projects/data228/topics/data228-hw8-out"))

    DataflowRunner().run_pipeline(pipeline, options=options)