Example #1
 def expand(self, pcoll):
     # We must have at least a single element to ensure the matcher
     # code gets run even if the input pcollection is empty.
     keyed_singleton = pcoll.pipeline | Create([(None, None)])
     keyed_actual = (pcoll
                     | WindowInto(window.GlobalWindows())
                     | "ToVoidKey" >> Map(lambda v: (None, v)))
     _ = ((keyed_singleton, keyed_actual)
          | "Group" >> CoGroupByKey()
          # Tuple-parameter unpacking is gone in Python 3; index into the
          # (key, (singleton_values, actual_values)) pair instead.
          | "Unkey" >> Map(lambda k_values: k_values[1][1])
          | "Match" >> Map(matcher))
Example #2
    def expand(self, pcoll):
      if reify_windows:
        pcoll = pcoll | ParDo(ReifyTimestampWindow())

      # We must have at least a single element to ensure the matcher
      # code gets run even if the input pcollection is empty.
      keyed_singleton = pcoll.pipeline | Create([(None, None)])
      keyed_actual = (
          pcoll
          | WindowInto(window.GlobalWindows())
          | "ToVoidKey" >> Map(lambda v: (None, v)))
      _ = ((keyed_singleton, keyed_actual)
           | "Group" >> CoGroupByKey()
           | "Unkey" >> Map(lambda k___actual_values: k___actual_values[1][1])
           | "Match" >> Map(matcher))
Example #3
        def expand(self, pcoll):
            if reify_windows:
                pcoll = pcoll | ParDo(ReifyTimestampWindow())

            keyed_singleton = pcoll.pipeline | Create([(None, None)])
            keyed_actual = (pcoll
                            | WindowInto(custom_windowing
                                         or window.GlobalWindows())
                            | "ToVoidKey" >> Map(lambda v: (None, v)))
            plain_actual = ((keyed_singleton, keyed_actual)
                            | "Group" >> CoGroupByKey()
                            | "Unkey" >> Map(lambda k_values: k_values[1][1]))

            if custom_windowing:
                plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

            plain_actual = plain_actual | "Match" >> Map(matcher)
Example #4
        def expand(self, pcoll):
            if reify_windows:
                pcoll = pcoll | ParDo(ReifyTimestampWindow())

            keyed_singleton = pcoll.pipeline | Create([(None, None)])

            if use_global_window:
                pcoll = pcoll | WindowInto(window.GlobalWindows())

            keyed_actual = pcoll | "ToVoidKey" >> Map(lambda v: (None, v))

            # This is a CoGroupByKey so that the matcher always runs, even if the
            # PCollection is empty.
            plain_actual = ((keyed_singleton, keyed_actual)
                            | "Group" >> CoGroupByKey()
                            | "Unkey" >> Map(lambda k_values: k_values[1][1]))

            if not use_global_window:
                plain_actual = plain_actual | "AddWindow" >> ParDo(AddWindow())

            plain_actual = plain_actual | "Match" >> Map(matcher)
Example #5
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=range-builtin-not-iterating
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=range-builtin-not-iterating
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        fastavro_output = '/'.join([self.output, 'fastavro'])
        avro_output = '/'.join([self.output, 'avro'])

        self.addCleanup(delete_files, [self.output + '*'])

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            self.SCHEMA,
            use_fastavro=True
        )

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_avro' >> WriteToAvro(
            avro_output,
            self.SCHEMA,
            use_fastavro=False
        )

        result = self.test_pipeline.run()
        result.wait_until_finish()
        assert result.state == PipelineState.DONE

        fastavro_read_pipeline = TestPipeline(is_integration_test=True)

        fastavro_records = \
            fastavro_read_pipeline \
            | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
            | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
            | Map(lambda rec: (rec['number'], rec))

        avro_records = \
            fastavro_read_pipeline \
            | 'create-avro' >> Create(['%s*' % avro_output]) \
            | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
            | Map(lambda rec: (rec['number'], rec))

        def check(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(sorted(v.keys()), ['avro', 'fastavro'])
            avro_values = v['avro']
            fastavro_values = v['fastavro']
            assertEqual(avro_values, fastavro_values)
            assertEqual(len(avro_values), 1)

        # pylint: disable=expression-not-assigned
        {
            'avro': avro_records,
            'fastavro': fastavro_records
        } \
        | CoGroupByKey() \
        | Map(check)

        read_result = fastavro_read_pipeline.run()
        read_result.wait_until_finish()
        # Assert on the read pipeline's own result, not the stale write result.
        assert read_result.state == PipelineState.DONE
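The `record` and `delete_files` helpers are defined elsewhere in the test module. A minimal sketch consistent with how they are called; the extra record field and the glob-expansion strategy are assumptions, but `FileSystems` is Beam's real filesystem abstraction:

    from apache_beam.io.filesystems import FileSystems

    def record(i):
        # One dict per index; 'number' is the key the checks join on.
        return {'number': i, 'number_str': str(i)}

    def delete_files(file_paths):
        # Expand each glob and delete whatever it matched (assumed behavior).
        for match_result in FileSystems.match(file_paths):
            FileSystems.delete(
                [m.path for m in match_result.metadata_list])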
Example #6
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000
        fastavro_output = '/'.join([self.output, 'fastavro'])

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=bad-option-value
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=bad-option-value
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            parse_schema(json.loads(self.SCHEMA_STRING)),
        )
        result = self.test_pipeline.run()
        result.wait_until_finish()
        fastavro_pcoll = self.test_pipeline \
                         | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
                         | 'read-fastavro' >> ReadAllFromAvro()

        mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(
            lambda x: (x['number'], x))
        mapped_record_pcoll = records_pcoll | "map_record" >> Map(
            lambda x: (x['number'], x))

        def validate_record(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
            record_pcoll_values = v['record_pcoll']
            fastavro_values = v['fastavro']
            assertEqual(record_pcoll_values, fastavro_values)
            assertEqual(len(record_pcoll_values), 1)

        {
            "record_pcoll": mapped_record_pcoll,
            "fastavro": mapped_fastavro_pcoll
        } | CoGroupByKey() | Map(validate_record)

        result = self.test_pipeline.run()
        result.wait_until_finish()

        self.addCleanup(delete_files, [self.output])
        assert result.state == PipelineState.DONE
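This variant writes through fastavro only and parses the schema up front with fastavro's `parse_schema`. The `SCHEMA_STRING` attribute is not shown; a minimal schema consistent with the `number` field the validation joins on (the namespace, record name, and extra field are illustrative):

    import json
    from fastavro import parse_schema

    SCHEMA_STRING = '''
    {"namespace": "example.avro",
     "type": "record",
     "name": "Record",
     "fields": [
       {"name": "number", "type": "int"},
       {"name": "number_str", "type": "string"}
     ]}
    '''

    schema = parse_schema(json.loads(SCHEMA_STRING))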