Example #1
 def test_pcolls_to_pcoll_id(self):
     p = beam.Pipeline(interactive_runner.InteractiveRunner())
     # pylint: disable=range-builtin-not-iterating
     init_pcoll = p | 'Init Create' >> beam.Impulse()
     _, ctx = p.to_runner_api(use_fake_coders=True, return_context=True)
     self.assertEqual(instr.pcolls_to_pcoll_id(p, ctx),
                      {str(init_pcoll): 'ref_PCollection_PCollection_1'})
Example #2
    def source(self, *labels):
        """Returns the StreamingCacheManager source.

        This is beam.Impulse() because unbounded sources will be marked with this
        and then the PipelineInstrument will replace these with a TestStream.
        """
        return beam.Impulse()
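The docstring above explains that this Impulse is only a placeholder that the PipelineInstrument later replaces with a TestStream. As a rough, hypothetical sketch (not part of this listing), a TestStream source of the kind that could stand in for the placeholder might look as follows; the element values and timestamps are made up for illustration:

import apache_beam as beam
from apache_beam.testing.test_stream import TestStream

# Hypothetical TestStream that could replace the beam.Impulse() placeholder:
# emit a few elements at watermark 0, then advance the watermark to infinity.
events = (
    TestStream()
    .advance_watermark_to(0)
    .add_elements(['a', 'b', 'c'])
    .advance_watermark_to_infinity())

with beam.Pipeline() as p:
  _ = p | 'Events' >> events | 'Print' >> beam.Map(print)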
Example #3
    def test_side_inputs(self):
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        root_read = beam.Impulse()

        result = (self.pipeline
                  | 'read' >> root_read
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        positive | ParDo(ProcessNumbersFn(), AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
Example #4
 def test_pcoll_to_pcoll_id(self):
     p = beam.Pipeline(interactive_runner.InteractiveRunner())
     ie.current_env().set_cache_manager(InMemoryCache(), p)
     # pylint: disable=bad-option-value
     init_pcoll = p | 'Init Create' >> beam.Impulse()
     _, ctx = p.to_runner_api(return_context=True)
     self.assertEqual(instr.pcoll_to_pcoll_id(p, ctx),
                      {str(init_pcoll): 'ref_PCollection_PCollection_1'})
Example #5
 def expand(self, pbegin):
   assert isinstance(pbegin, pvalue.PBegin), (
       'Input to transform must be a PBegin but found %s' % pbegin)
   return (
       pbegin
       | 'Impulse' >> beam.Impulse()
       | 'GenerateKeys' >> beam.ParDo(
           StatefulLoadGenerator.GenerateKeys(self.num_keys, self.key_size))
       | 'GenerateLoad' >> beam.ParDo(
           StatefulLoadGenerator.GenerateLoad(
               self.num_records // self.num_keys, self.value_size)))
Example #6
 def build_write_pipeline(self, pipeline):
     _ = (
         pipeline
         | 'Impulse' >> beam.Impulse()
         | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=range-builtin-not-iterating
         | 'Reshuffle' >> beam.Reshuffle()
         | 'MakeKV' >> beam.Map(lambda x:
                                (b'', str(x).encode())).with_output_types(
                                    typing.Tuple[bytes, bytes])
         | 'WriteToKafka' >> WriteToKafka(
             producer_config={'bootstrap.servers': self.bootstrap_servers},
             topic=self.topic,
             expansion_service=self.expansion_service))
Example #7
def main(options):
    with beam.Pipeline(options=options) as pipe:
        (pipe
         | 'Impulse' >> beam.Impulse()
         | 'LocalEnv' >> GetEnv('FOO')
         | 'Local' >> Log(color='green')
         | 'RemoteEnv' >> EnvTransform(GetEnv, {'FOO': 'BAR'}, 'FOO')
         | 'Remote' >> Log(color='cyan')
         | 'AssertEnv' >> EnvTransform(AssertEnv, {'FOO': 'BAR'}, FOO='BAR')
         | 'ClsLvlNeedsEnv' >> NeedsEnv()
         # TODO:
         # | 'InstLvlNeedsEnv' >> NeedsEnv().with_env(FOO='BAZ')
         )
Example #8
 def run_write_pipeline(
     self, num_rows, to_row_fn, row_type, spanner_transform=None):
   with TestPipeline(is_integration_test=True) as p:
     p.not_use_test_runner_api = True
     _ = (
         p
         | 'Impulse' >> beam.Impulse()
         | 'Generate' >> beam.FlatMap(lambda x: range(num_rows))  # pylint: disable=range-builtin-not-iterating
         | 'Map to row' >> beam.Map(to_row_fn).with_output_types(row_type)
         | 'Write to Spanner' >> spanner_transform(
             instance_id=self.instance_id,
             database_id=self.database_id,
             project_id=self.project_id,
             table=self.table,
             emulator_host=self.spanner_helper.get_emulator_host(),
         ))
Example #9
def send(pipeline_options):

    with beam.Pipeline(options=pipeline_options) as pipe:
        (pipe
         | beam.Impulse()
         | beam.Map(lambda x: (1, x))
         | Log()
         # | 'KafkaWrite' >> WriteToKafka(
         #       producer_config={
         #           'bootstrap.servers': 'localhost:9092',
         #       },
         #       topic=TOPIC,
         #       # key_serializer='org.apache.kafka.common.serialization.ByteArraySerializer',
         #       # value_serializer='org.apache.kafka.common.serialization.ByteArraySerializer',
         #       expansion_service='localhost:8097',
         #   )
         )
Example #10
  def test_visit_entire_graph(self):
    pipeline = Pipeline()
    pcoll1 = pipeline | 'pcoll' >> beam.Impulse()
    pcoll2 = pcoll1 | 'do1' >> FlatMap(lambda x: [x + 1])
    pcoll3 = pcoll2 | 'do2' >> FlatMap(lambda x: [x + 1])
    pcoll4 = pcoll2 | 'do3' >> FlatMap(lambda x: [x + 1])
    transform = PipelineTest.CustomTransform()
    pcoll5 = pcoll4 | transform

    visitor = PipelineTest.Visitor(visited=[])
    pipeline.visit(visitor)
    self.assertEqual({pcoll1, pcoll2, pcoll3, pcoll4, pcoll5},
                     set(visitor.visited))
    self.assertEqual(set(visitor.enter_composite), set(visitor.leave_composite))
    self.assertEqual(2, len(visitor.enter_composite))
    self.assertEqual(visitor.enter_composite[1].transform, transform)
    self.assertEqual(visitor.leave_composite[0].transform, transform)
Example #11
    def test_pardo_with_unbounded_per_element_dofn(self):
        class UnboundedDoFn(beam.DoFn):
            @beam.DoFn.unbounded_per_element()
            def process(self, element):
                pass

        class BoundedDoFn(beam.DoFn):
            def process(self, element):
                pass

        with TestPipeline() as p:
            source = p | beam.Impulse()
            unbounded_pcoll = source | beam.ParDo(UnboundedDoFn())
            bounded_pcoll = source | beam.ParDo(BoundedDoFn())

            self.assertEqual(unbounded_pcoll.is_bounded, False)
            self.assertEqual(bounded_pcoll.is_bounded, True)
Example #12
 def run_kinesis_write(self):
     with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
         p.not_use_test_runner_api = True
         _ = (
             p
             | 'Impulse' >> beam.Impulse()
             | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=bad-option-value
             | 'Map to bytes' >> beam.Map(lambda x: RECORD + str(x).encode(
             )).with_output_types(bytes)
             | 'WriteToKinesis' >> WriteToKinesis(
                 stream_name=self.aws_kinesis_stream,
                 aws_access_key=self.aws_access_key,
                 aws_secret_key=self.aws_secret_key,
                 region=self.aws_region,
                 service_endpoint=self.aws_service_endpoint,
                 verify_certificate=(not self.use_localstack),
                 partition_key='1',
                 producer_properties=self.producer_properties,
             ))
Example #13
  def test_schema_autodetect_not_allowed_with_avro_file_loads(self):
    with TestPipeline() as p:
      pc = p | beam.Impulse()

      with self.assertRaisesRegex(ValueError, '^A schema must be provided'):
        _ = (
            pc
            | 'No Schema' >> beam.io.gcp.bigquery.WriteToBigQuery(
                "dataset.table",
                schema=None,
                temp_file_format=bigquery_tools.FileFormat.AVRO))

      with self.assertRaisesRegex(ValueError,
                                  '^Schema auto-detection is not supported'):
        _ = (
            pc
            | 'Schema Autodetected' >> beam.io.gcp.bigquery.WriteToBigQuery(
                "dataset.table",
                schema=beam.io.gcp.bigquery.SCHEMA_AUTODETECT,
                temp_file_format=bigquery_tools.FileFormat.AVRO))
Example #14
    def test_passthrough(self):
        """
        Test that PTransforms which pass through their input PCollection can be
        used with PipelineInfo.
        """
        class Passthrough(beam.PTransform):
            def expand(self, pcoll):
                return pcoll

        p = beam.Pipeline(runner=self.runner)
        p | beam.Impulse() | Passthrough()  # pylint: disable=expression-not-assigned
        proto = to_stable_runner_api(p).components
        info = pipeline_analyzer.PipelineInfo(proto)
        for pcoll_id in info.all_pcollections():
            # FIXME: If PipelineInfo does not support passthrough PTransforms, this
            #        will only fail some of the time, depending on the ordering of
            #        transforms in the Pipeline proto.

            # Should not throw exception
            info.cache_label(pcoll_id)
Example #15
    def test_root_transforms(self):
        root_read = beam.Impulse()
        root_flatten = Flatten(pipeline=self.pipeline)

        pbegin = pvalue.PBegin(self.pipeline)
        pcoll_read = pbegin | 'read' >> root_read
        pcoll_read | FlatMap(lambda x: x)
        [] | 'flatten' >> root_flatten

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]

        self.assertCountEqual(root_transforms, [root_read, root_flatten])

        pbegin_consumers = [
            c.transform for c in self.visitor.value_to_consumers[pbegin]
        ]
        self.assertCountEqual(pbegin_consumers, [root_read])
        self.assertEqual(len(self.visitor.step_names), 3)
Example #16
    def test_side_inputs(self):
        class SplitNumbersFn(DoFn):
            def process(self, element):
                if element < 0:
                    yield pvalue.TaggedOutput('tag_negative', element)
                else:
                    yield element

        class ProcessNumbersFn(DoFn):
            def process(self, element, negatives):
                yield element

        def _process_numbers(pcoll, negatives):
            first_output = (pcoll
                            | 'process numbers step 1' >> ParDo(
                                ProcessNumbersFn(), negatives))

            second_output = (first_output
                             | 'process numbers step 2' >> ParDo(
                                 ProcessNumbersFn(), negatives))

            output_pc = ((first_output, second_output)
                         | 'flatten results' >> beam.Flatten())
            return output_pc

        root_read = beam.Impulse()

        result = (self.pipeline
                  | 'read' >> root_read
                  | ParDo(SplitNumbersFn()).with_outputs('tag_negative',
                                                         main='positive'))
        positive, negative = result
        _process_numbers(positive, AsList(negative))

        self.pipeline.visit(self.visitor)

        root_transforms = [t.transform for t in self.visitor.root_transforms]
        self.assertEqual(root_transforms, [root_read])
        self.assertEqual(len(self.visitor.step_names), 5)
        self.assertEqual(len(self.visitor.views), 1)
        self.assertTrue(isinstance(self.visitor.views[0], pvalue.AsList))
Example #17
    def test_pardo_state_with_custom_key_coder(self):
        """Tests that state requests work correctly when the key coder is an
        SDK-specific coder, i.e. non standard coder. This is additionally enforced
        by Java's ProcessBundleDescriptorsTest and by Flink's
        ExecutableStageDoFnOperator which detects invalid encoding by checking for
        the correct key group of the encoded key."""
        index_state_spec = userstate.CombiningValueStateSpec('index', sum)

        # Test params
        # Ensure decent amount of elements to serve all partitions
        n = 200
        duplicates = 1

        split = n // (duplicates + 1)
        inputs = [(i % split, str(i % split)) for i in range(0, n)]

        # Use a DoFn which has to use FastPrimitivesCoder because the type cannot
        # be inferred
        class Input(beam.DoFn):
            def process(self, impulse):
                for i in inputs:
                    yield i

        class AddIndex(beam.DoFn):
            def process(self, kv,
                        index=beam.DoFn.StateParam(index_state_spec)):
                k, v = kv
                index.add(1)
                yield k, v, index.read()

        expected = [(i % split, str(i % split), i // split + 1)
                    for i in range(0, n)]

        with self.create_pipeline() as p:
            assert_that(
                p
                | beam.Impulse()
                | beam.ParDo(Input())
                | beam.ParDo(AddIndex()), equal_to(expected))
Example #18
  def run_write(self):
    def user_data_mapper(test_row):
      return [
          str(test_row.number_column).encode('utf-8'),
          str(test_row.boolean_column).encode('utf-8'),
          binascii.hexlify(test_row.bytes_column),
      ]

    with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
      p.not_use_test_runner_api = True
      _ = (
          p
          | 'Impulse' >> beam.Impulse()
          | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=range-builtin-not-iterating
          | 'Map to TestRow' >> beam.Map(
              lambda num: TestRow(
                  num, num % 2 == 0, b"test" + str(num).encode()))
          | WriteToSnowflake(
              server_name=self.server_name,
              username=self.username,
              password=self.password,
              o_auth_token=self.o_auth_token,
              private_key_path=self.private_key_path,
              raw_private_key=self.raw_private_key,
              private_key_passphrase=self.private_key_passphrase,
              schema=self.schema,
              database=self.database,
              role=self.role,
              warehouse=self.warehouse,
              staging_bucket_name=self.staging_bucket_name,
              storage_integration_name=self.storage_integration_name,
              create_disposition=CreateDisposition.CREATE_IF_NEEDED,
              write_disposition=WriteDisposition.TRUNCATE,
              table_schema=SCHEMA_STRING,
              user_data_mapper=user_data_mapper,
              table=self.table,
              query=None,
              expansion_service=self.expansion_service,
          ))
Example #19
    def test(self):
        class SequenceSideInputTestDoFn(beam.DoFn):
            """Iterate over first n side_input elements."""
            def __init__(self, first_n):
                self._first_n = first_n

            def process(self, unused_element, side_input):
                i = 0
                it = iter(side_input)
                while i < self._first_n:
                    i += 1
                    try:
                        # No-op. We only make sure that the element is accessed.
                        next(it)
                    except StopIteration:
                        return

        class MappingSideInputTestDoFn(beam.DoFn):
            """Take a sequence of keys as an additional side input and for each
            key in the sequence checks the value for key in the dictionary."""
            def process(self, unused_element, dict_side_input, keys_to_check):
                for key in keys_to_check:
                    # No-op. We only make sure that the element is accessed.
                    dict_side_input[key]

        class GetRandomKeys(beam.DoFn):
            def __init__(self, n):
                self._n = n

            def process(self, unused_element, dict_side_input):
                import random
                n = min(self._n, len(dict_side_input))
                return random.sample(dict_side_input.keys(), n)

        class AddEventTimestamps(beam.DoFn):
            """Assign timestamp to each element of PCollection."""
            def setup(self):
                self._timestamp = 0

            def process(self, element):
                from apache_beam.transforms.combiners import window
                yield window.TimestampedValue(element, self._timestamp)
                self._timestamp += 1

        input_pc = (self.pipeline
                    | 'Read synthetic' >> beam.io.Read(
                        SyntheticSource(self.parse_synthetic_source_options()))
                    | 'Collect start time metrics' >> beam.ParDo(
                        MeasureTime(self.metrics_namespace)))

        if self.side_input_size != self.input_options.get('num_records'):
            side_input = (
                input_pc
                | 'Sample {} elements'.format(self.side_input_size) >>
                beam.combiners.Sample.FixedSizeGlobally(self.side_input_size)
                | 'Flatten a sequence' >> beam.FlatMap(lambda x: x))
        else:
            side_input = input_pc

        if self.windows > 0:
            window_size = self.side_input_size / self.windows
            logging.info('Fixed windows of %s seconds will be applied',
                         window_size)
            side_input = (
                side_input
                | 'Add event timestamps' >> beam.ParDo(AddEventTimestamps())
                | 'Apply windows' >> beam.WindowInto(
                    beam.combiners.window.FixedWindows(window_size)))

        side_input_type = self.materialize_as()
        elements_to_access = self.side_input_size * self.access_percentage // 100
        logging.info(
            '%s out of %s total elements in the side input will be '
            'accessed.', elements_to_access, self.side_input_size)
        if side_input_type is beam.pvalue.AsDict:
            random_keys = (self.pipeline
                           | beam.Impulse()
                           | 'Get random keys' >> beam.ParDo(
                               GetRandomKeys(elements_to_access),
                               beam.pvalue.AsDict(side_input)))
            pc = input_pc | beam.ParDo(MappingSideInputTestDoFn(),
                                       side_input_type(side_input),
                                       beam.pvalue.AsList(random_keys))
        else:
            pc = input_pc | beam.ParDo(
                SequenceSideInputTestDoFn(elements_to_access),
                side_input_type(side_input))

        _ = pc | 'Collect end time metrics' >> beam.ParDo(
            MeasureTime(self.metrics_namespace))
Example #20
 def test_impulse(self):
     with test_pipeline.TestPipeline(runner='BundleBasedDirectRunner') as p:
         assert_that(p | beam.Impulse(), equal_to([b'']))
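Example #20 checks that beam.Impulse() emits exactly one empty bytes element. A minimal sketch (not taken from the listing) of the fan-out pattern that several of the write pipelines above build on top of that single seed element:

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to

with TestPipeline() as p:
  numbers = (
      p
      | 'Impulse' >> beam.Impulse()                      # single b'' element
      | 'Generate' >> beam.FlatMap(lambda _: range(3)))  # fan out to 0, 1, 2
  assert_that(numbers, equal_to([0, 1, 2]))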
Example #21
def run(argv=None):
    options = PipelineOptions(argv)
    options.view_as(SetupOptions).save_main_session = True
    with Pipeline(options=options) as p:
        (p | beam.Impulse() | beam.ParDo(MysqlDoFn()))