def test_full_completion(self):
    # Create dummy file and close it.  Note that we need to do this because
    # Windows does not allow NamedTemporaryFiles to be reopened elsewhere
    # before the temporary file is closed.
    dummy_file = tempfile.NamedTemporaryFile(delete=False)
    dummy_file_name = dummy_file.name
    dummy_file.close()

    dummy_dir = tempfile.mkdtemp()

    remote_runner = DataflowRunner()
    pipeline = Pipeline(remote_runner,
                        options=PipelineOptions([
                            '--dataflow_endpoint=ignored',
                            '--sdk_location=' + dummy_file_name,
                            '--job_name=test-job',
                            '--project=test-project',
                            '--staging_location=' + dummy_dir,
                            '--temp_location=/dev/null',
                            '--template_location=' + dummy_file_name,
                            '--no_auth=True']))

    pipeline | beam.Create([1, 2, 3]) | beam.Map(lambda x: x) # pylint: disable=expression-not-assigned
    pipeline.run().wait_until_finish()
    with open(dummy_file_name) as template_file:
      saved_job_dict = json.load(template_file)
      self.assertEqual(
          saved_job_dict['environment']['sdkPipelineOptions']
          ['options']['project'], 'test-project')
      self.assertEqual(
          saved_job_dict['environment']['sdkPipelineOptions']
          ['options']['job_name'], 'test-job')
Example #2
  def test_direct_runner_metrics(self):

    class MyDoFn(beam.DoFn):
      def start_bundle(self):
        count = Metrics.counter(self.__class__, 'bundles')
        count.inc()

      def finish_bundle(self):
        count = Metrics.counter(self.__class__, 'finished_bundles')
        count.inc()

      def process(self, element):
        gauge = Metrics.gauge(self.__class__, 'latest_element')
        gauge.set(element)
        count = Metrics.counter(self.__class__, 'elements')
        count.inc()
        distro = Metrics.distribution(self.__class__, 'element_dist')
        distro.update(element)
        return [element]

    runner = DirectRunner()
    p = Pipeline(runner,
                 options=PipelineOptions(self.default_properties))
    pcoll = (p | ptransform.Create([1, 2, 3, 4, 5])
             | 'Do' >> beam.ParDo(MyDoFn()))
    assert_that(pcoll, equal_to([1, 2, 3, 4, 5]))
    result = p.run()
    result.wait_until_finish()
    metrics = result.metrics().query()
    namespace = '{}.{}'.format(MyDoFn.__module__,
                               MyDoFn.__name__)

    hc.assert_that(
        metrics['counters'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'elements')),
                5, 5),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'bundles')),
                1, 1),
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'finished_bundles')),
                1, 1)))

    hc.assert_that(
        metrics['distributions'],
        hc.contains_inanyorder(
            MetricResult(
                MetricKey('Do', MetricName(namespace, 'element_dist')),
                DistributionResult(DistributionData(15, 5, 1, 5)),
                DistributionResult(DistributionData(15, 5, 1, 5)))))

    gauge_result = metrics['gauges'][0]
    hc.assert_that(
        gauge_result.key,
        hc.equal_to(MetricKey('Do', MetricName(namespace, 'latest_element'))))
    hc.assert_that(gauge_result.committed.value, hc.equal_to(5))
    hc.assert_that(gauge_result.attempted.value, hc.equal_to(5))
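
For reference, result.metrics().query() with no arguments returns every metric in the job; a minimal sketch (reusing the result object from the test above and the standard MetricsFilter API) of narrowing the query to a single counter:

from apache_beam.metrics.metric import MetricsFilter

# Restrict the query to the 'elements' counter reported by the 'Do' step.
elements_filter = MetricsFilter().with_name('elements').with_step('Do')
elements_counters = result.metrics().query(elements_filter)['counters']
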
 def test_biqquery_read_streaming_fail(self):
   remote_runner = DataflowRunner()
   self.default_properties.append("--streaming")
   p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
   _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
   with self.assertRaisesRegexp(ValueError,
                                r'source is not currently available'):
     p.run()
  def test_remote_runner_translation(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
     | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
     | ptransform.GroupByKey())
    p.run()
Example #5
 def run(self, transform, options=None):
   """Run the given transform with this runner.
   """
   # Imported here to avoid circular dependencies.
   # pylint: disable=wrong-import-order, wrong-import-position
   from apache_beam.pipeline import Pipeline
   p = Pipeline(runner=self, options=options)
   p | transform
   return p.run()
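
A minimal usage sketch for this helper, assuming it is the base PipelineRunner.run and that DirectRunner inherits it unchanged; the composite transform is built by chaining PTransforms with | and is wrapped in a fresh Pipeline by the method above:

import apache_beam as beam
from apache_beam.runners.direct.direct_runner import DirectRunner

# Chain two PTransforms into one composite and hand it to the runner;
# run() applies it to a new Pipeline and executes that pipeline.
result = DirectRunner().run(
    beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 10))
result.wait_until_finish()
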
Example #6
  def test_parent_pointer(self):
    class MyPTransform(beam.PTransform):

      def expand(self, p):
        self.p = p
        return p | beam.Create([None])

    p = beam.Pipeline()
    p | MyPTransform()  # pylint: disable=expression-not-assigned
    p = Pipeline.from_runner_api(Pipeline.to_runner_api(p), None, None)
    self.assertIsNotNone(p.transforms_stack[0].parts[0].parent)
    self.assertEqual(p.transforms_stack[0].parts[0].parent,
                     p.transforms_stack[0])
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Expose the wrapped DoFn, its class, and the timestamp as display data.
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    p.run()
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[1]
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__ + '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
Example #8
 def run(self, transform, options=None):
   """Run the given transform or callable with this runner.
   """
   # Imported here to avoid circular dependencies.
   # pylint: disable=wrong-import-order, wrong-import-position
   from apache_beam import PTransform
   from apache_beam.pvalue import PBegin
   from apache_beam.pipeline import Pipeline
   p = Pipeline(runner=self, options=options)
   if isinstance(transform, PTransform):
     p | transform
   else:
     transform(PBegin(p))
   return p.run()
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    p.run()
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #10
 def test_reuse_custom_transform_instance(self):
   pipeline = Pipeline()
   pcoll1 = pipeline | 'pcoll1' >> Create([1, 2, 3])
   pcoll2 = pipeline | 'pcoll2' >> Create([4, 5, 6])
   transform = PipelineTest.CustomTransform()
   pcoll1 | transform
   with self.assertRaises(RuntimeError) as cm:
     pipeline.apply(transform, pcoll2)
   self.assertEqual(
       cm.exception.args[0],
       'Transform "CustomTransform" does not have a stable unique label. '
       'This will prevent updating of pipelines. '
       'To apply a transform with a specified label write '
       'pvalue | "label" >> transform')
 def setUp(self):
   self.pipeline = Pipeline(DirectRunner())
   self.visitor = ConsumerTrackingPipelineVisitor()
   try:                    # Python 2
     self.assertCountEqual = self.assertItemsEqual
   except AttributeError:  # Python 3
     pass
  def test_bad_path(self):
    dummy_sdk_file = tempfile.NamedTemporaryFile()
    remote_runner = DataflowRunner()
    pipeline = Pipeline(remote_runner,
                        options=PipelineOptions([
                            '--dataflow_endpoint=ignored',
                            '--sdk_location=' + dummy_sdk_file.name,
                            '--job_name=test-job',
                            '--project=test-project',
                            '--staging_location=ignored',
                            '--temp_location=/dev/null',
                            '--template_location=/bad/path',
                            '--no_auth=True']))
    remote_runner.job = apiclient.Job(pipeline._options)

    with self.assertRaises(IOError):
      pipeline.run().wait_until_finish()
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    p = Pipeline(remote_runner, PipelineOptions(self.default_properties))
    p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    remote_runner.job = apiclient.Job(p._options)
    # Performing configured PTransform overrides here.
    p.replace_all(DataflowRunner._PTRANSFORM_OVERRIDES)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 2)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
Example #14
 def run(self, request, context):
   job_id = uuid.uuid4().hex
   pipeline_result = Pipeline.from_runner_api(
       request.pipeline,
       'DirectRunner',
       PipelineOptions()).run()
   self.jobs[job_id] = pipeline_result
   return beam_job_api_pb2.SubmitJobResponse(jobId=job_id)
Example #15
  def test_visit_entire_graph(self):
    pipeline = Pipeline()
    pcoll1 = pipeline | 'pcoll' >> Create([1, 2, 3])
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    pcoll5 = pcoll4 | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)
    self.assertEqual(set([pcoll1, pcoll2, pcoll3, pcoll4, pcoll5]),
                     set(visitor.visited))
    self.assertEqual(set(visitor.enter_composite),
                     set(visitor.leave_composite))
    self.assertEqual(3, len(visitor.enter_composite))
    self.assertEqual(visitor.enter_composite[2].transform, transform)
    self.assertEqual(visitor.leave_composite[1].transform, transform)
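
The PipelineTest.Visitor helper is not shown in this snippet; a hypothetical sketch of a visitor with the same shape, based on the PipelineVisitor hooks the test exercises:

from apache_beam.pipeline import PipelineVisitor

class Visitor(PipelineVisitor):
  def __init__(self, visited):
    self.visited = visited
    self.enter_composite = []
    self.leave_composite = []

  def visit_value(self, value, producer_node):
    # Record every PValue (e.g. PCollection) reached during the walk.
    self.visited.append(value)

  def enter_composite_transform(self, transform_node):
    self.enter_composite.append(transform_node)

  def leave_composite_transform(self, transform_node):
    self.leave_composite.append(transform_node)
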
Example #16
  def run_async(self, transform, options=None):
    """Run the given transform or callable with this runner.

    May return immediately, executing the pipeline in the background.
    The returned result object can be queried for progress, and
    `wait_until_finish` may be called to block until completion.
    """
    # Imported here to avoid circular dependencies.
    # pylint: disable=wrong-import-order, wrong-import-position
    from apache_beam import PTransform
    from apache_beam.pvalue import PBegin
    from apache_beam.pipeline import Pipeline
    p = Pipeline(runner=self, options=options)
    if isinstance(transform, PTransform):
      p | transform
    else:
      transform(PBegin(p))
    return p.run()
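
The else branch above also accepts a plain callable that receives the pipeline's PBegin; a minimal sketch, assuming a runner such as DirectRunner inherits this method:

import apache_beam as beam
from apache_beam.runners.direct.direct_runner import DirectRunner

# The callable builds the graph itself from the PBegin of the freshly
# created pipeline; run_async then executes it.
result = DirectRunner().run_async(
    lambda begin: begin | beam.Create(['a', 'b']) | beam.Map(print))
result.wait_until_finish()
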
  def test_use_fastavro_experiment_is_added_on_py3_and_onwards(self):
    remote_runner = DataflowRunner()

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    self.assertEqual(
        sys.version_info[0] > 2,
        remote_runner.job.options.view_as(DebugOptions).lookup_experiment(
            'use_fastavro', False))
Example #18
 def test_biqquery_read_fn_api_fail(self):
     remote_runner = DataflowRunner()
     for flag in ['beam_fn_api', 'use_unified_worker', 'use_runner_v2']:
         self.default_properties.append("--experiments=%s" % flag)
         with self.assertRaisesRegex(
                 ValueError, 'The Read.BigQuerySource.*is not supported.*'
                 'apache_beam.io.gcp.bigquery.ReadFromBigQuery.*'):
             with Pipeline(remote_runner,
                           PipelineOptions(self.default_properties)) as p:
                 _ = p | beam.io.Read(beam.io.BigQuerySource('some.table'))
Example #19
    def test_min_cpu_platform_flag_is_propagated_to_experiments(self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--min_cpu_platform=Intel Haswell')

        with Pipeline(remote_runner,
                      PipelineOptions(self.default_properties)) as p:
            p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
        self.assertIn(
            'min_cpu_platform=Intel Haswell',
            remote_runner.job.options.view_as(DebugOptions).experiments)
Example #20
  def test_pipeline_sdk_not_overridden(self):
    pipeline_options = PipelineOptions([
        '--experiments=beam_fn_api',
        '--experiments=use_unified_worker',
        '--temp_location',
        'gs://any-location/temp',
        '--worker_harness_container_image=dummy_prefix/dummy_name:dummy_tag'
    ])

    pipeline = Pipeline(options=pipeline_options)
    pipeline | Create([1, 2, 3]) | ParDo(DoFn())  # pylint:disable=expression-not-assigned

    proto_pipeline, _ = pipeline.to_runner_api(return_context=True)

    dummy_env = beam_runner_api_pb2.Environment(
        urn=common_urns.environments.DOCKER.urn,
        payload=(
            beam_runner_api_pb2.DockerPayload(
                container_image='dummy_prefix/dummy_name:dummy_tag')
        ).SerializeToString())
    proto_pipeline.components.environments['dummy_env_id'].CopyFrom(dummy_env)

    dummy_transform = beam_runner_api_pb2.PTransform(
        environment_id='dummy_env_id')
    proto_pipeline.components.transforms['dummy_transform_id'].CopyFrom(
        dummy_transform)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, dict(), pipeline_options)

    self.assertEqual(2, len(proto_pipeline.components.environments))

    from apache_beam.utils import proto_utils
    found_override = False
    for env in proto_pipeline.components.environments.values():
      docker_payload = proto_utils.parse_Bytes(
          env.payload, beam_runner_api_pb2.DockerPayload)
      if docker_payload.container_image.startswith(
          names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY):
        found_override = True

    self.assertFalse(found_override)
Example #21
 def test_environment_override_translation(self):
     self.default_properties.append('--experiments=beam_fn_api')
     self.default_properties.append('--worker_harness_container_image=FOO')
     remote_runner = DataflowRunner()
     p = Pipeline(remote_runner,
                  options=PipelineOptions(self.default_properties))
     (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
      | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
      | ptransform.GroupByKey())
     p.run()
     self.assertEqual(
         list(
             remote_runner.proto_pipeline.components.environments.values()),
         [
             beam_runner_api_pb2.Environment(
                 urn=common_urns.environments.DOCKER.urn,
                 payload=beam_runner_api_pb2.DockerPayload(
                     container_image='FOO').SerializeToString())
         ])
Example #22
    def test_remote_runner_translation(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        (p | ptransform.Create([1, 2, 3])  # pylint: disable=expression-not-assigned
         | 'Do' >> ptransform.FlatMap(lambda x: [(x, x)])
         | ptransform.GroupByKey())
        remote_runner.job = apiclient.Job(p.options)
        super(DataflowRunner, remote_runner).run(p)
  def test_upload_graph_experiment(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=upload_graph')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('upload_graph', experiments_for_job)
  def test_use_fastavro_experiment_is_not_added_when_use_avro_is_present(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=use_avro')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    debug_options = remote_runner.job.options.view_as(DebugOptions)

    self.assertFalse(debug_options.lookup_experiment('use_fastavro', False))
Example #25
    def test_unsupported_fnapi_features(self):
        remote_runner = DataflowRunner()
        self.default_properties.append('--experiment=beam_fn_api')
        self.default_properties.append('--experiment=use_runner_v2')

        with self.assertRaisesRegex(RuntimeError, 'Unsupported merging'):
            with Pipeline(remote_runner,
                          options=PipelineOptions(
                              self.default_properties)) as p:
                # pylint: disable=expression-not-assigned
                p | beam.Create([]) | beam.WindowInto(CustomMergingWindowFn())

        with self.assertRaisesRegex(RuntimeError, 'Unsupported window coder'):
            with Pipeline(remote_runner,
                          options=PipelineOptions(
                              self.default_properties)) as p:
                # pylint: disable=expression-not-assigned
                p | beam.Create([]) | beam.WindowInto(
                    CustomWindowTypeWindowFn())
Example #26
    def test_remote_runner_display_data(self):
        remote_runner = DataflowRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions(self.default_properties))

        now = datetime.now()
        # pylint: disable=expression-not-assigned
        (p | ptransform.Create([1, 2, 3, 4, 5])
         | 'Do' >> SpecialParDo(SpecialDoFn(), now))

        # TODO(BEAM-366) Enable runner API on this test.
        p.run(test_runner_api=False)
        job_dict = json.loads(str(remote_runner.job))
        steps = [
            step for step in job_dict['steps']
            if len(step['properties'].get('display_data', [])) > 0
        ]
        step = steps[1]
        disp_data = step['properties']['display_data']
        nspace = SpecialParDo.__module__ + '.'
        expected_data = [{
            'type': 'TIMESTAMP',
            'namespace': nspace + 'SpecialParDo',
            'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
            'key': 'a_time'
        }, {
            'type': 'STRING',
            'namespace': nspace + 'SpecialParDo',
            'value': nspace + 'SpecialParDo',
            'key': 'a_class',
            'shortValue': 'SpecialParDo'
        }, {
            'type': 'INTEGER',
            'namespace': nspace + 'SpecialDoFn',
            'value': 42,
            'key': 'dofn_value'
        }]
        self.assertUnhashableCountEqual(disp_data, expected_data)
Example #27
    def test_simple(self):
        """Tests serializing, deserializing, and running a simple pipeline.

    More extensive tests are done at pipeline.run for each suitable test.
    """
        p = beam.Pipeline()
        p | beam.Create([None]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
        proto = p.to_runner_api()

        p2 = Pipeline.from_runner_api(proto, p.runner, p.options)
        p2.run()
Example #28
  def test_simple(self):
    """Tests serializing, deserializing, and running a simple pipeline.

    More extensive tests are done at pipeline.run for each suitable test.
    """
    p = beam.Pipeline()
    p | beam.Create([None]) | beam.Map(lambda x: x)  # pylint: disable=expression-not-assigned
    proto = p.to_runner_api()

    p2 = Pipeline.from_runner_api(proto, p.runner, p._options)
    p2.run()
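
Because the runner API representation is an ordinary protobuf message, it can also be persisted as bytes and restored later; a small sketch under that assumption, reusing proto and p from the snippet above:

from apache_beam.portability.api import beam_runner_api_pb2

# Round-trip the pipeline proto through its wire format.
serialized = proto.SerializeToString()
restored = beam_runner_api_pb2.Pipeline.FromString(serialized)
p3 = Pipeline.from_runner_api(restored, p.runner, p._options)
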
Example #29
    def test_transform_ids(self):
        class MyPTransform(beam.PTransform):
            def expand(self, p):
                self.p = p
                return p | beam.Create([None])

        p = beam.Pipeline()
        p | MyPTransform()  # pylint: disable=expression-not-assigned
        runner_api_proto = Pipeline.to_runner_api(p)

        for transform_id in runner_api_proto.components.transforms:
            self.assertRegex(transform_id, r'[a-zA-Z0-9-_]+')
  def test_dataflow_worker_jar_flag_adds_use_staged_worker_jar_experiment(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--experiment=beam_fn_api')
    self.default_properties.append('--dataflow_worker_jar=test.jar')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('beam_fn_api', experiments_for_job)
    self.assertIn('use_staged_dataflow_worker_jar', experiments_for_job)
Example #31
    def test_ptransform_override_replacement_inputs(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return (isinstance(applied_ptransform.transform, ParDo)
                        and isinstance(applied_ptransform.transform.fn,
                                       AddWithProductDoFn))

            def get_replacement_transform(self, transform):
                return AddThenMultiply()

            def get_replacement_inputs(self, applied_ptransform):
                assert len(applied_ptransform.inputs) == 1
                assert len(applied_ptransform.side_inputs) == 2
                # Swap the order of the two side inputs
                return (applied_ptransform.inputs[0],
                        applied_ptransform.side_inputs[1].pvalue,
                        applied_ptransform.side_inputs[0].pvalue)

        p = Pipeline()
        pcoll1 = p | 'pc1' >> beam.Create([2])
        pcoll2 = p | 'pc2' >> beam.Create([3])
        pcoll3 = p | 'pc3' >> beam.Create([4, 5, 6])
        result = pcoll3 | 'Operate' >> beam.ParDo(
            AddWithProductDoFn(), AsSingleton(pcoll1), AsSingleton(pcoll2))
        assert_that(result, equal_to([14, 16, 18]))

        p.replace_all([MyParDoOverride()])
        p.run()
Example #32
  def test_remote_runner_display_data(self):
    remote_runner = DataflowRunner()
    p = Pipeline(remote_runner,
                 options=PipelineOptions(self.default_properties))

    # TODO: Should not subclass ParDo. Switch to PTransform as soon as
    # composite transforms support display data.
    class SpecialParDo(beam.ParDo):
      def __init__(self, fn, now):
        super(SpecialParDo, self).__init__(fn)
        self.fn = fn
        self.now = now

      # Expose the wrapped DoFn, its class, and the timestamp as display data.
      def display_data(self):
        return {'asubcomponent': self.fn,
                'a_class': SpecialParDo,
                'a_time': self.now}

    class SpecialDoFn(beam.DoFn):
      def display_data(self):
        return {'dofn_value': 42}

      def process(self):
        pass

    now = datetime.now()
    # pylint: disable=expression-not-assigned
    (p | ptransform.Create([1, 2, 3, 4, 5])
     | 'Do' >> SpecialParDo(SpecialDoFn(), now))

    remote_runner.job = apiclient.Job(p.options)
    super(DataflowRunner, remote_runner).run(p)
    job_dict = json.loads(str(remote_runner.job))
    steps = [step
             for step in job_dict['steps']
             if len(step['properties'].get('display_data', [])) > 0]
    step = steps[0]
    disp_data = step['properties']['display_data']
    disp_data = sorted(disp_data, key=lambda x: x['namespace']+x['key'])
    nspace = SpecialParDo.__module__ + '.'
    expected_data = [{'type': 'TIMESTAMP', 'namespace': nspace+'SpecialParDo',
                      'value': DisplayDataItem._format_value(now, 'TIMESTAMP'),
                      'key': 'a_time'},
                     {'type': 'STRING', 'namespace': nspace+'SpecialParDo',
                      'value': nspace+'SpecialParDo', 'key': 'a_class',
                      'shortValue': 'SpecialParDo'},
                     {'type': 'INTEGER', 'namespace': nspace+'SpecialDoFn',
                      'value': 42, 'key': 'dofn_value'}]
    expected_data = sorted(expected_data, key=lambda x: x['namespace']+x['key'])
    self.assertEqual(len(disp_data), 3)
    self.assertEqual(disp_data, expected_data)
Example #33
    def test_sdk_harness_container_image_overrides(self):
        if 'sdkHarnessContainerImages' not in dataflow.WorkerPool.__dict__:
            _LOGGER.warning(
                'Skipping test \'test_sdk_harness_container_image_overrides\' since '
                'Dataflow API WorkerPool does not have attribute '
                '\'sdkHarnessContainerImages\'')
            return
        pipeline_options = PipelineOptions([
            '--experiments=beam_fn_api',
            '--experiments=use_unified_worker',
            '--temp_location',
            'gs://any-location/temp',
            '--project',
            'dummy_project',
            '--sdk_harness_container_image_overrides',
            '.*dummy.*,new_dummy_container_image',
        ])

        pipeline = Pipeline(options=pipeline_options)

        test_environment = DockerEnvironment(
            container_image='dummy_container_image')
        proto_pipeline, _ = pipeline.to_runner_api(
            return_context=True, default_environment=test_environment)
        dataflow_client = apiclient.DataflowApplicationClient(pipeline_options)

        # Accessing non-public method for testing.
        dataflow_client._apply_sdk_environment_overrides(proto_pipeline)

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by the given override.
        self.assertEqual(docker_payload.container_image,
                         'new_dummy_container_image')
Example #34
    def run_async(
            self,
            transform,  # type: PTransform
            options=None  # type: Optional[PipelineOptions]
    ):
        # type: (...) -> PipelineResult
        """Run the given transform or callable with this runner.

    May return immediately, executing the pipeline in the background.
    The returned result object can be queried for progress, and
    `wait_until_finish` may be called to block until completion.
    """
        # Imported here to avoid circular dependencies.
        # pylint: disable=wrong-import-order, wrong-import-position
        from apache_beam import PTransform
        from apache_beam.pvalue import PBegin
        from apache_beam.pipeline import Pipeline
        p = Pipeline(runner=self, options=options)
        if isinstance(transform, PTransform):
            p | transform
        else:
            transform(PBegin(p))
        return p.run()
  def test_streaming_create_translation(self):
    remote_runner = DataflowRunner()
    self.default_properties.append("--streaming")
    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned
    job_dict = json.loads(str(remote_runner.job))
    self.assertEqual(len(job_dict[u'steps']), 3)

    self.assertEqual(job_dict[u'steps'][0][u'kind'], u'ParallelRead')
    self.assertEqual(
        job_dict[u'steps'][0][u'properties'][u'pubsub_subscription'],
        '_starting_signal/')
    self.assertEqual(job_dict[u'steps'][1][u'kind'], u'ParallelDo')
    self.assertEqual(job_dict[u'steps'][2][u'kind'], u'ParallelDo')
  def test_streaming_engine_flag_adds_windmill_experiments(self):
    remote_runner = DataflowRunner()
    self.default_properties.append('--streaming')
    self.default_properties.append('--enable_streaming_engine')
    self.default_properties.append('--experiment=some_other_experiment')

    with Pipeline(remote_runner, PipelineOptions(self.default_properties)) as p:
      p | ptransform.Create([1])  # pylint: disable=expression-not-assigned

    experiments_for_job = (
        remote_runner.job.options.view_as(DebugOptions).experiments)
    self.assertIn('enable_streaming_engine', experiments_for_job)
    self.assertIn('enable_windmill_service', experiments_for_job)
    self.assertIn('some_other_experiment', experiments_for_job)
Example #37
 def test_no_group_by_key_directly_after_bigquery(self):
     remote_runner = DataflowRunner()
     p = Pipeline(remote_runner,
                  options=PipelineOptions([
                      '--dataflow_endpoint=ignored', '--job_name=test-job',
                      '--project=test-project',
                      '--staging_location=ignored',
                      '--temp_location=/dev/null', '--no_auth'
                  ]))
     rows = p | beam.io.Read(beam.io.BigQuerySource('dataset.faketable'))
      with self.assertRaises(ValueError,
                             msg=('Coder for the GroupByKey operation '
                                  '"GroupByKey" is not a key-value coder: '
                                  'RowAsDictJsonCoder')):
         unused_invalid = rows | beam.GroupByKey()
Example #38
 def test_after_count(self):
   p = Pipeline('DirectRunner')
   result = (p
             | beam.Create([1, 2, 3, 4, 5, 10, 11])
             | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
             | beam.Map(lambda k_t: TimestampedValue(k_t, k_t[1]))
             | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                               accumulation_mode=AccumulationMode.DISCARDING)
             | beam.GroupByKey()
             | beam.Map(lambda k_v: ('%s-%s' % (k_v[0], len(k_v[1])), set(k_v[1]))))
   assert_that(result, equal_to(
       {
           'A-5': {1, 2, 3, 4, 5},
           # A-10, A-11 never emitted due to AfterCount(3) never firing.
           'B-4': {6, 7, 8, 9},
           'B-3': {10, 15, 16},
        }.items()))
Example #39
    def test_pcollectionview_not_recreated(self):
        pipeline = Pipeline('DirectRunner')
        value = pipeline | 'create1' >> Create([1, 2, 3])
        value2 = pipeline | 'create2' >> Create([(1, 1), (2, 2), (3, 3)])
        value3 = pipeline | 'create3' >> Create([(1, 1), (2, 2), (3, 3)])
        self.assertEqual(AsSingleton(value), AsSingleton(value))
        self.assertEqual(AsSingleton('new', value, default_value=1),
                         AsSingleton('new', value, default_value=1))
        self.assertNotEqual(AsSingleton(value),
                            AsSingleton('new', value, default_value=1))
        self.assertEqual(AsIter(value), AsIter(value))
        self.assertEqual(AsList(value), AsList(value))
        self.assertEqual(AsDict(value2), AsDict(value2))

        self.assertNotEqual(AsSingleton(value), AsSingleton(value2))
        self.assertNotEqual(AsIter(value), AsIter(value2))
        self.assertNotEqual(AsList(value), AsList(value2))
        self.assertNotEqual(AsDict(value2), AsDict(value3))
  def test_ptransform_overrides(self, file_system_override_mock):
    class MyParDoOverride(PTransformOverride):
      def matches(self, applied_ptransform):
        return isinstance(applied_ptransform.transform, DoubleParDo)

      def get_replacement_transform(self, ptransform):
        if isinstance(ptransform, DoubleParDo):
          return TripleParDo()
        raise ValueError('Unsupported type of transform: %r' % ptransform)

    def get_overrides(unused_pipeline_options):
      return [MyParDoOverride()]

    file_system_override_mock.side_effect = get_overrides

    # Specify DirectRunner as it's the one patched above.
    with Pipeline(runner='BundleBasedDirectRunner') as p:
      pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
      assert_that(pcoll, equal_to([3, 6, 9]))
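
The DoubleParDo and TripleParDo helpers used by these override tests are defined elsewhere in the test module; a hypothetical sketch of what such transforms could look like, purely for orientation:

import apache_beam as beam

class DoubleParDo(beam.PTransform):
  def expand(self, pcoll):
    # Multiply every element by two.
    return pcoll | beam.Map(lambda x: x * 2)

class TripleParDo(beam.PTransform):
  def expand(self, pcoll):
    # Multiply every element by three.
    return pcoll | beam.Map(lambda x: x * 3)
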
Example #41
    def test_ptransform_overrides(self):
        def my_par_do_matcher(applied_ptransform):
            return isinstance(applied_ptransform.transform, DoubleParDo)

        class MyParDoOverride(PTransformOverride):
            def get_matcher(self):
                return my_par_do_matcher

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                raise ValueError('Unsupported type of transform: %r',
                                 ptransform)

        # Using the following private variable for testing.
        DirectRunner._PTRANSFORM_OVERRIDES.append(MyParDoOverride())
        with Pipeline() as p:
            pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
            assert_that(pcoll, equal_to([3, 6, 9]))
Example #42
  def test_sdk_harness_container_image_overrides(self):
    test_environment = DockerEnvironment(
        container_image='dummy_container_image')
    proto_pipeline, _ = Pipeline().to_runner_api(
      return_context=True, default_environment=test_environment)

    # Accessing non-public method for testing.
    apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
        proto_pipeline, {'.*dummy.*': 'new_dummy_container_image'})

    self.assertEqual(1, len(proto_pipeline.components.environments))
    env = list(proto_pipeline.components.environments.values())[0]

    from apache_beam.utils import proto_utils
    docker_payload = proto_utils.parse_Bytes(
        env.payload, beam_runner_api_pb2.DockerPayload)

    # Container image should be overridden by the given override.
    self.assertEqual(
        docker_payload.container_image, 'new_dummy_container_image')
Example #43
    def test_dataflow_container_image_override(self):
        test_environment = DockerEnvironment(
            container_image='apache/beam_java11_sdk:x.yz.0')
        proto_pipeline, _ = Pipeline().to_runner_api(
            return_context=True, default_environment=test_environment)

        # Accessing non-public method for testing.
        apiclient.DataflowApplicationClient._apply_sdk_environment_overrides(
            proto_pipeline, dict())

        self.assertEqual(1, len(proto_pipeline.components.environments))
        env = list(proto_pipeline.components.environments.values())[0]

        from apache_beam.utils import proto_utils
        docker_payload = proto_utils.parse_Bytes(
            env.payload, beam_runner_api_pb2.DockerPayload)

        # Container image should be overridden by the given override.
        self.assertTrue(
            docker_payload.container_image.startswith(
                names.DATAFLOW_CONTAINER_IMAGE_REPOSITORY))
Example #44
    def test_visitor_not_sorted(self):
        p = Pipeline()
        # pylint: disable=expression-not-assigned
        from apache_beam.testing.test_stream import TestStream
        p | TestStream().add_elements(['']) | beam.Map(lambda _: _)

        original_graph = p.to_runner_api(return_context=False)
        out_of_order_graph = p.to_runner_api(return_context=False)

        root_id = out_of_order_graph.root_transform_ids[0]
        root = out_of_order_graph.components.transforms[root_id]
        tmp = root.subtransforms[0]
        root.subtransforms[0] = root.subtransforms[1]
        root.subtransforms[1] = tmp

        p = beam.Pipeline().from_runner_api(out_of_order_graph,
                                            runner='BundleBasedDirectRunner',
                                            options=None)
        v_out_of_order = ConsumerTrackingPipelineVisitor()
        p.visit(v_out_of_order)

        p = beam.Pipeline().from_runner_api(original_graph,
                                            runner='BundleBasedDirectRunner',
                                            options=None)
        v_original = ConsumerTrackingPipelineVisitor()
        p.visit(v_original)

        # Convert to string to assert they are equal.
        out_of_order_labels = {
            str(k): [str(t) for t in v_out_of_order.value_to_consumers[k]]
            for k in v_out_of_order.value_to_consumers
        }

        original_labels = {
            str(k): [str(t) for t in v_original.value_to_consumers[k]]
            for k in v_original.value_to_consumers
        }
        self.assertDictEqual(out_of_order_labels, original_labels)
Example #45
  def test_ptransform_override_multiple_inputs(self):
    class MyParDoOverride(PTransformOverride):
      def matches(self, applied_ptransform):
        return isinstance(applied_ptransform.transform, FlattenAndDouble)

      def get_replacement_transform(self, applied_ptransform):
        return FlattenAndTriple()

    p = Pipeline()
    pcoll1 = p | 'pc1' >> beam.Create([1, 2, 3])
    pcoll2 = p | 'pc2' >> beam.Create([4, 5, 6])
    pcoll3 = (pcoll1, pcoll2) | 'FlattenAndMultiply' >> FlattenAndDouble()
    assert_that(pcoll3, equal_to([3, 6, 9, 12, 15, 18]))

    p.replace_all([MyParDoOverride()])
    p.run()
Example #46
    def test_ptransform_overrides(self, file_system_override_mock):
        def my_par_do_matcher(applied_ptransform):
            return isinstance(applied_ptransform.transform, DoubleParDo)

        class MyParDoOverride(PTransformOverride):
            def get_matcher(self):
                return my_par_do_matcher

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                raise ValueError('Unsupported type of transform: %r',
                                 ptransform)

        def get_overrides():
            return [MyParDoOverride()]

        file_system_override_mock.side_effect = get_overrides

        with Pipeline() as p:
            pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
            assert_that(pcoll, equal_to([3, 6, 9]))
Example #47
    def test_ptransform_overrides(self):
        class MyParDoOverride(PTransformOverride):
            def matches(self, applied_ptransform):
                return isinstance(applied_ptransform.transform, DoubleParDo)

            def get_replacement_transform(self, ptransform):
                if isinstance(ptransform, DoubleParDo):
                    return TripleParDo()
                raise ValueError('Unsupported type of transform: %r' %
                                 ptransform)

        p = Pipeline()
        pcoll = p | beam.Create([1, 2, 3]) | 'Multiply' >> DoubleParDo()
        assert_that(pcoll, equal_to([3, 6, 9]))

        p.replace_all([MyParDoOverride()])
        p.run()
class ConsumerTrackingPipelineVisitorTest(unittest.TestCase):

  def setUp(self):
    self.pipeline = Pipeline(DirectRunner())
    self.visitor = ConsumerTrackingPipelineVisitor()

  def test_root_transforms(self):
    class DummySource(iobase.BoundedSource):
      pass

    root_read = Read(DummySource())
    root_flatten = Flatten(pipeline=self.pipeline)

    pbegin = pvalue.PBegin(self.pipeline)
    pcoll_read = pbegin | 'read' >> root_read
    pcoll_read | FlatMap(lambda x: x)
    [] | 'flatten' >> root_flatten

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])

    self.assertEqual(root_transforms, sorted(
        [root_read, root_flatten]))

    pbegin_consumers = sorted(
        [c.transform for c in self.visitor.value_to_consumers[pbegin]])
    self.assertEqual(pbegin_consumers, sorted([root_read]))
    self.assertEqual(len(self.visitor.step_names), 3)

  def test_side_inputs(self):

    class SplitNumbersFn(DoFn):

      def process(self, element):
        if element < 0:
          yield pvalue.OutputValue('tag_negative', element)
        else:
          yield element

    class ProcessNumbersFn(DoFn):

      def process(self, element, negatives):
        yield element

    class DummySource(iobase.BoundedSource):
      pass

    root_read = Read(DummySource())

    result = (self.pipeline
              | 'read' >> root_read
              | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                     main='positive'))
    positive, negative = result
    positive | ParDo(ProcessNumbersFn(), AsList(negative))

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])
    self.assertEqual(root_transforms, sorted([root_read]))
    self.assertEqual(len(self.visitor.step_names), 3)
    self.assertEqual(len(self.visitor.views), 1)
    self.assertTrue(isinstance(self.visitor.views[0],
                               pvalue.AsList))

  def test_co_group_by_key(self):
    emails = self.pipeline | 'email' >> Create([('joe', '*****@*****.**')])
    phones = self.pipeline | 'phone' >> Create([('mary', '111-222-3333')])
    {'emails': emails, 'phones': phones} | CoGroupByKey()

    self.pipeline.visit(self.visitor)

    root_transforms = sorted(
        [t.transform for t in self.visitor.root_transforms])
    self.assertEqual(len(root_transforms), 2)
    self.assertGreater(
        len(self.visitor.step_names), 3)  # 2 creates + expanded CoGBK
    self.assertEqual(len(self.visitor.views), 0)