Example #1
0
 def test_implicit_payload_builder_with_bytes(self):
   """Compares the payload built from bytes values with the expected payload.

   On Python 3 the expectation is get_payload(PayloadBase.args). On Python 2
   it is assembled here from per-field ConfigValue entries that use the
   bytes coder URN for the string-like fields.
   """
   values = PayloadBase.bytes_values
   result = ImplicitSchemaPayloadBuilder(values).build()

   if sys.version_info[0] >= 3:
     self.assertEqual(result, get_payload(PayloadBase.args))
     return

   def encoded(coder, field):
     # Nested encoding of one field of the input values with the given coder.
     return coder.get_impl().encode_nested(values[field])

   # in python 2.x bytes coder will be inferred
   args = {
       'integer_example': ConfigValue(
           coder_urn=['beam:coder:varint:v1'],
           payload=encoded(VarIntCoder(), 'integer_example')),
       'string_example': ConfigValue(
           coder_urn=['beam:coder:bytes:v1'],
           payload=encoded(StrUtf8Coder(), 'string_example')),
       'list_of_strings': ConfigValue(
           coder_urn=['beam:coder:iterable:v1', 'beam:coder:bytes:v1'],
           payload=encoded(IterableCoder(StrUtf8Coder()), 'list_of_strings')),
       'optional_kv': ConfigValue(
           coder_urn=[
               'beam:coder:kv:v1',
               'beam:coder:bytes:v1',
               'beam:coder:double:v1',
           ],
           payload=encoded(
               TupleCoder([StrUtf8Coder(), FloatCoder()]), 'optional_kv')),
   }
   self.assertEqual(result, get_payload(args))
Example #2
0
    def test_implicit_payload_builder(self):
        """Builds a payload from PayloadBase.values and checks the decoded row.

        The built payload is decoded with a RowCoder created from the payload's
        own schema, and each input field is compared with the decoded attribute.
        """
        built = ImplicitSchemaPayloadBuilder(PayloadBase.values).build()
        row = RowCoder(built.schema).decode(built.payload)

        for field_name, expected in PayloadBase.values.items():
            # getattr defaults to None on purpose: ImplicitSchemaPayloadBuilder
            # omits fields with value=None since their type cannot be inferred.
            actual = getattr(row, field_name, None)
            self.assertEqual(actual, expected)
Example #3
0
    def test_external_empty_spec_translation(self):
        """Checks that a cleared external transform spec stays absent in the proto.

        Runs a pipeline containing an external prefix transform, clears the spec
        of the expanded transform labelled '.../TestLabel', converts the pipeline
        to its runner API proto and asserts that the matching proto transform
        has no 'spec {' section in its text form.
        """
        external_transform_label = (
            'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel')

        pipeline = beam.Pipeline()
        external_transform = beam.ExternalTransform(
            'beam:transforms:xlang:test:prefix',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer())
        _ = (pipeline | beam.Create(['a', 'b']) | external_transform)
        pipeline.run().wait_until_finish()

        expanded_transforms = external_transform._expanded_components.transforms
        for transform in expanded_transforms.values():
            # We clear the spec of one of the external transforms.
            if transform.unique_name == external_transform_label:
                transform.spec.Clear()

        context = pipeline_context.PipelineContext()
        proto_pipeline = pipeline.to_runner_api(context=context)

        # Keep the last transform whose name matches the label (None if absent).
        proto_transform = None
        for transform in proto_pipeline.components.transforms.values():
            if transform.unique_name == external_transform_label:
                proto_transform = transform

        self.assertIsNotNone(proto_transform)
        # assertNotIn reports the offending text on failure, unlike
        # assertTrue(... .find(...) == -1).
        self.assertNotIn('spec {', str(proto_transform))
 def test_xlang_parquetio_write(self):
     """Applies the external Parquet write transform to three Avro records.

     The expansion jar and port are read from the EXPANSION_JAR and
     EXPANSION_PORT environment variables. A RuntimeError whose message
     mentions PARQUET_WRITE_URN is reported with a printed message and
     swallowed; any other RuntimeError propagates.
     """
     # NOTE(review): os.environ.get yields None when a variable is unset, which
     # makes the string concatenation below fail - confirm the harness always
     # exports both variables.
     expansion_jar = os.environ.get('EXPANSION_JAR')
     port = os.environ.get('EXPANSION_PORT')
     address = 'localhost:%s' % port
     try:
         with TestPipeline() as p:
             p.get_pipeline_options().view_as(
                 DebugOptions).experiments.append('jar_packages=' +
                                                  expansion_jar)
             p.not_use_test_runner_api = True
             _ = (
                 p
                 | beam.Create([
                     AvroRecord({"name": "abc"}),
                     AvroRecord({"name": "def"}),
                     AvroRecord({"name": "ghi"}),
                 ])
                 | beam.ExternalTransform(
                     PARQUET_WRITE_URN,
                     ImplicitSchemaPayloadBuilder(
                         {'data': u'/tmp/test.parquet'}),
                     address))
     except RuntimeError as e:
         # Substring match: the URN is an identifier, not a regex pattern, so
         # any regex metacharacters in it must not be interpreted.
         if PARQUET_WRITE_URN not in str(e):
             # Bare raise keeps the original traceback intact.
             raise
         print(
             "looks like URN not implemented in expansion service, skipping."
         )
Example #5
0
    def test_implicit_payload_builder_with_bytes(self):
        """Builds a payload from bytes values and checks the decoded row.

        The decoded fields are compared against PayloadBase.values, not against
        the bytes input. Afterwards the test confirms that converting
        typing.List[bytes] still yields typehints.List[bytes].
        """
        built = ImplicitSchemaPayloadBuilder(PayloadBase.bytes_values).build()
        row = RowCoder(built.schema).decode(built.payload)

        for field_name, expected in PayloadBase.values.items():
            # getattr defaults to None on purpose: ImplicitSchemaPayloadBuilder
            # omits fields with value=None since their type cannot be inferred.
            actual = getattr(row, field_name, None)
            self.assertEqual(actual, expected)

        # Verify we have not modified a cached type (BEAM-10766)
        # TODO(BEAM-7372): Remove when bytes coercion code is removed.
        expected_hint = typehints.List[bytes]
        converted_hint = convert_to_beam_type(typing.List[bytes])
        self.assertEqual(expected_hint, converted_hint)
    def run_pipeline(pipeline_options,
                     expansion_service,
                     wait_until_finish=True):
        """Runs a simple count-filtered-letters pipeline via external transforms.

        Args:
          pipeline_options: options handed to TestPipeline.
          expansion_service: expansion service passed to ExternalTransform. An
            int is treated as a port and turned into 'localhost:<port>'.
          wait_until_finish: when true, wait for the pipeline result to finish.
        """
        # The actual definitions of these transforms is in
        # org.apache.beam.runners.core.construction.TestExpansionService.
        count_urn = "beam:transforms:xlang:count"
        filter_urn = "beam:transforms:xlang:filter_less_than_eq"

        p = TestPipeline(options=pipeline_options)

        # Only the port was specified.
        if isinstance(expansion_service, int):
            expansion_service = 'localhost:%s' % expansion_service

        letters = p | beam.Create(list('aaabccxyyzzz')) | beam.Map(unicode)
        filtered = letters | beam.ExternalTransform(
            filter_urn,
            ImplicitSchemaPayloadBuilder({'data': u'middle'}),
            expansion_service)
        counted = filtered | beam.ExternalTransform(
            count_urn, None, expansion_service)
        res = counted | beam.Map(lambda kv: '%s: %s' % kv)

        assert_that(res, equal_to(['a: 3', 'b: 1', 'c: 2']))

        result = p.run()
        if not wait_until_finish:
            return
        result.wait_until_finish()
Example #7
0
 def run_prefix(self, pipeline):
     """Applies the TEST_PREFIX_URN external transform to ['a', 'b'].

     The transform is configured with data u'0' and the result is asserted to
     equal ['0a', '0b'].
     """
     with pipeline as p:
         elements = p | beam.Create(['a', 'b']).with_output_types(unicode)
         res = elements | beam.ExternalTransform(
             TEST_PREFIX_URN,
             ImplicitSchemaPayloadBuilder({'data': u'0'}),
             self.expansion_service)
         assert_that(res, equal_to(['0a', '0b']))
Example #8
0
    def test_implicit_payload_builder_with_bytes(self):
        """Builds a payload from bytes values and checks the decoded row.

        On Python 2 the decoded fields are compared with the bytes values; on
        Python 3 they are compared with PayloadBase.values.
        """
        built = ImplicitSchemaPayloadBuilder(PayloadBase.bytes_values).build()
        row = RowCoder(built.schema).decode(built.payload)

        if sys.version_info[0] < 3:
            expected_fields = PayloadBase.bytes_values
        else:
            expected_fields = PayloadBase.values

        for field_name, expected in expected_fields.items():
            # getattr defaults to None on purpose: ImplicitSchemaPayloadBuilder
            # omits fields with value=None since their type cannot be inferred.
            actual = getattr(row, field_name, None)
            self.assertEqual(actual, expected)
Example #9
0
    def test_external_transform_finder_leaf(self):
        """Runs a pipeline ending in an external transform and checks the flag.

        After the run, pipeline.contains_external_transforms must be truthy.
        """
        pipeline = beam.Pipeline()
        elements = pipeline | beam.Create(['a', 'b'])
        _ = elements | beam.ExternalTransform(
            'beam:transforms:xlang:test:nooutput',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer())
        pipeline.run().wait_until_finish()

        self.assertTrue(pipeline.contains_external_transforms)
 def test_prefix(self):
     """Runs the prefix external transform on a TestPipeline.

     The expansion jar is appended to the DebugOptions experiments as a
     'jar_packages=' entry and the output is asserted to equal ['0a', '0b'].
     """
     test_pipeline = TestPipeline()
     debug_options = test_pipeline.get_pipeline_options().view_as(
         DebugOptions)
     debug_options.experiments.append(
         'jar_packages=' + ValidateRunnerXlangTest.expansion_jar)
     test_pipeline.not_use_test_runner_api = True

     with test_pipeline as p:
         elements = p | beam.Create(['a', 'b']).with_output_types(unicode)
         res = elements | beam.ExternalTransform(
             TEST_PREFIX_URN,
             ImplicitSchemaPayloadBuilder({'data': u'0'}),
             ValidateRunnerXlangTest.expansion_service)
         assert_that(res, equal_to(['0a', '0b']))
Example #11
0
 def __init__(self, start, stop=None,
              elements_per_period=None, max_read_time=None,
              expansion_service=None):
   """Initializes the transform with an implicit-schema payload.

   The start, stop, elements_per_period and max_read_time arguments are
   packed under those same keys into an ImplicitSchemaPayloadBuilder, which
   is passed to the parent constructor together with self.URN and
   expansion_service.
   """
   config = {
       'start': start,
       'stop': stop,
       'elements_per_period': elements_per_period,
       'max_read_time': max_read_time,
   }
   payload_builder = ImplicitSchemaPayloadBuilder(config)
   super(GenerateSequence, self).__init__(
       self.URN, payload_builder, expansion_service)
Example #12
0
 def test_as_external_transform_no_kwargs(self):
   """Applies _TestTransform through the fully-qualified-name external URN.

   Only positional args (arg0='x', arg1='y') are supplied in the payload; the
   output for inputs a, b, c is asserted to be xay, xby, xcy.
   """
   constructor = (
       'apache_beam.transforms'
       '.fully_qualified_named_transform_test._TestTransform')
   with FullyQualifiedNamedTransform.with_filter('*'):
     with beam.Pipeline() as p:
       transformed = (
           p
           | beam.Create(['a', 'b', 'c'])
           | beam.ExternalTransform(
               PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
               ImplicitSchemaPayloadBuilder({
                   'constructor': constructor,
                   'args': beam.Row(arg0='x', arg1='y'),
               }),
               expansion_service.ExpansionServiceServicer()))
       assert_that(transformed, equal_to(['xay', 'xby', 'xcy']))
Example #13
0
def run(input_path, output_path, expansion_service_port, pipeline_args):
    """Reads text, prefixes it via an external and a Python step, writes it.

    Args:
      input_path: path handed to ReadFromText.
      output_path: path handed to WriteToText.
      expansion_service_port: port of the expansion service on localhost.
      pipeline_args: arguments used to build the PipelineOptions.
    """
    def python_prefix(record):
        return 'python:%s' % record

    options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=options) as p:
        lines = p | 'Read' >> ReadFromText(input_path).with_output_types(str)

        java_output = lines | 'JavaPrefix' >> beam.ExternalTransform(
            'beam:transform:org.apache.beam:javaprefix:v1',
            ImplicitSchemaPayloadBuilder({'prefix': 'java:'}),
            ('localhost:%s' % expansion_service_port))

        _ = (
            java_output
            | ('PythonPrefix' >> beam.Map(python_prefix))
            | ('Write' >> WriteToText(output_path)))
Example #14
0
  def test_pipeline_generation_with_runner_overrides(self):
    """Checks the PubSub read output feeds the external transform in the proto.

    Builds a streaming pipeline with the options listed below, converts it to
    its runner API proto, locates the PubSub read and the external prefix
    transforms by unique name and asserts that the read has exactly one output
    and that its outputs equal the external transform's inputs.
    """
    pipeline_properties = [
        '--dataflow_endpoint=ignored',
        '--job_name=test-job',
        '--project=test-project',
        '--staging_location=ignored',
        '--temp_location=/dev/null',
        '--no_auth',
        '--dry_run=True',
        '--sdk_location=container',
        '--runner=DataflowRunner',
        '--streaming'
    ]
    options = PipelineOptions(pipeline_properties)

    with beam.Pipeline(options=options) as p:
      messages = p | beam.io.ReadFromPubSub(
          subscription=
          'projects/dummy-project/subscriptions/dummy-subscription')
      _ = messages | beam.ExternalTransform(
          'beam:transforms:xlang:test:prefix',
          ImplicitSchemaPayloadBuilder({'data': u'0'}),
          expansion_service.ExpansionServiceServicer())

    pipeline_proto, _ = p.to_runner_api(return_context=True)

    pubsub_read_transform = None
    external_transform = None
    proto_transforms = pipeline_proto.components.transforms
    for transform_id in proto_transforms:
      candidate = proto_transforms[transform_id]
      # The last matching transform wins for each of the two lookups.
      if 'beam:transforms:xlang:test:prefix' in candidate.unique_name:
        external_transform = candidate
      if 'ReadFromPubSub' in candidate.unique_name:
        pubsub_read_transform = candidate

    if not pubsub_read_transform or not external_transform:
      raise ValueError(
          'Could not find an external transform and the PubSub read transform '
          'in the pipeline')

    read_outputs = list(pubsub_read_transform.outputs.values())
    self.assertEqual(1, len(read_outputs))
    self.assertEqual(
        list(pubsub_read_transform.outputs.values()),
        list(external_transform.inputs.values()))
 def run_prefix(self, pipeline):
     """Checks the prefix external transform on a two-element input.

     Target transform - ParDo
     (https://beam.apache.org/documentation/programming-guide/#pardo)
     Test scenario - Mapping elements from a single input collection to a
     single output collection
     Boundary conditions checked -
      - PCollection<?> to external transforms
      - PCollection<?> from external transforms
     """
     with pipeline as p:
         elements = p | beam.Create(['a', 'b']).with_output_types(unicode)
         res = elements | beam.ExternalTransform(
             TEST_PREFIX_URN,
             ImplicitSchemaPayloadBuilder({'data': u'0'}),
             self.expansion_service)
         assert_that(res, equal_to(['0a', '0b']))
Example #16
0
    def test_pipeline_generation(self):
        """Compares the external transform before and after a proto round trip.

        The original pipeline is expected to hold the un-expanded external
        transform (no parts), while the pipeline rebuilt from the runner API
        proto holds the expanded one, whose first part carries the
        '.../TestLabel' full label.
        """
        pipeline = beam.Pipeline()
        elements = pipeline | beam.Create(['a', 'b'])
        _ = elements | beam.ExternalTransform(
            'beam:transforms:xlang:test:prefix',
            ImplicitSchemaPayloadBuilder({'data': u'0'}),
            expansion_service.ExpansionServiceServicer())

        proto, _ = pipeline.to_runner_api(return_context=True)
        pipeline_from_proto = Pipeline.from_runner_api(
            proto, pipeline.runner, pipeline._options)

        # Original pipeline has the un-expanded external transform
        original_external = pipeline.transforms_stack[0].parts[1]
        self.assertEqual([], original_external.parts)

        # new pipeline has the expanded external transform
        rebuilt_external = pipeline_from_proto.transforms_stack[0].parts[1]
        self.assertNotEqual([], rebuilt_external.parts)
        self.assertEqual(
            u'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel',
            rebuilt_external.parts[0].full_label)
 def to_runner_api_parameter(self, unused_context):
     """Returns TEST_PREFIX_URN and a payload wrapping self._payload as 'data'.

     The context argument is not used.
     """
     payload_builder = ImplicitSchemaPayloadBuilder({'data': self._payload})
     return TEST_PREFIX_URN, payload_builder.payload()
Example #18
0
 def test_implicit_payload_builder(self):
     """Checks the built payload equals get_payload(PayloadBase.args)."""
     actual = ImplicitSchemaPayloadBuilder(PayloadBase.values).build()
     self.assertEqual(actual, get_payload(PayloadBase.args))