Example #1
    def test_no_annotations(self):
        def fn(a: int) -> int:
            return a

        with self.assertRaisesRegex(TypeCheckError,
                                    r'requires .*int.* but got .*str'):
            _ = ['a', 'b', 'c'] | Map(fn)

        # The same pipeline doesn't raise once the annotations are stripped from fn.
        fn = decorators.no_annotations(fn)
        _ = ['a', 'b', 'c'] | Map(fn)
Example #2
    def test_origin(self):
        def annotated(e: str) -> str:
            return e

        t = Map(annotated)
        th = t.get_type_hints()
        th = th.with_input_types(str)
        self.assertRegex(th.debug_str(), r'with_input_types')
        th = th.with_output_types(str)
        self.assertRegex(
            th.debug_str(),
            r'(?s)with_output_types.*with_input_types.*Map.annotated')
Example #3
    def test_no_annotations(self):
        def fn(a: int) -> int:
            return a

        _ = [1, 2, 3] | Map(fn)  # Doesn't raise - correct types.

        with self.assertRaisesRegex(TypeCheckError,
                                    r'requires .*int.* but got .*str'):
            _ = ['a', 'b', 'c'] | Map(fn)

        @decorators.no_annotations
        def fn2(a: int) -> int:
            return a

        _ = ['a', 'b', 'c'] | Map(fn2)  # Doesn't raise - no input type hints.
Example #4
    def _create_input_data(self):
        """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # The Synthetic Source emits two-part (key, value) records; we only
            # need the value part here.
            import base64
            return {'data': base64.b64encode(record[1])}

        with TestPipeline() as p:
            (  # pylint: disable=expression-not-assigned
                p
                | 'Produce rows' >> Read(
                    SyntheticSource(self.parse_synthetic_source_options()))
                | 'Format' >> Map(format_record)
                | 'Write to BigQuery' >> WriteToBigQuery(
                    dataset=self.input_dataset,
                    table=self.input_table,
                    schema=SCHEMA,
                    create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #5
def main():
    # bq_source = BigQuerySource(query="""
    #                            SELECT created_at, text
    #                            FROM got_sentiment.got_tweets
    #                            """,
    #                            validate=False, coder=None,
    #                            use_standard_sql=True, flatten_results=True,
    #                            kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #                              with_attributes=False,
    #                             timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery('streaming_scores',
                                         dataset=BQ_DATASET,
                                         project=PROJECT_ID,
                                         create_disposition='CREATE_IF_NEEDED',
                                         write_disposition='WRITE_APPEND',
                                         batch_size=20))
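The pipeline above relies on a user-defined AddWindowTimestampFn DoFn that is not shown. A minimal sketch of what such a DoFn could look like, assuming the combined output is a (key, score) pair and that the downstream FormatForWrite step wants the window bound attached (names and output shape are assumptions, not the original code):

import apache_beam as beam

class AddWindowTimestampFn(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        key, score = element  # assumed shape of the CombinePerKey output
        # Attach the end of the current window so rows can be grouped per window.
        yield key, score, window.end.to_utc_datetime().isoformat()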
Example #6
    def test_fixed_after_count_accumulating(self):
        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
              .add_elements([('k1', 1), ('k1', 1)])
              .advance_watermark_to(2)
              .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed two-second windows, firing repeatedly after every two elements,
        # with accumulating panes.
        windowing = Windowing(FixedWindows(2),
                              triggerfn=Repeatedly(AfterCount(2)),
                              accumulation_mode=AccumulationMode.ACCUMULATING)

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 2), [1, 1]),
                    ('k2', IntervalWindow(0, 2), [1, 1]),
                    ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
                ]))
Example #7
    def test(self):
        def format_record(record):
            import base64
            return base64.b64encode(record[1])

        def make_insert_mutations(element):
            import uuid  # pylint: disable=reimported
            from apache_beam.io.gcp.experimental.spannerio import WriteMutation
            ins_mutation = WriteMutation.insert(table='test',
                                                columns=('id', 'data'),
                                                values=[(str(uuid.uuid1()),
                                                         element)])
            return [ins_mutation]

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Make mutations' >> FlatMap(make_insert_mutations)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to Spanner' >> WriteToSpanner(
                project_id=self.project,
                instance_id=self.spanner_instance,
                database_id=self.TEST_DATABASE,
                max_batch_size_bytes=5120))
Example #8
    def _create_input_data(self):
        """
    Runs an additional pipeline which creates test data and waits for its
    completion.
    """
        def format_record(record):
            import base64
            return base64.b64encode(record[1])

        def make_insert_mutations(element):
            import uuid
            from apache_beam.io.gcp.experimental.spannerio import WriteMutation
            ins_mutation = WriteMutation.insert(table='test_data',
                                                columns=('id', 'data'),
                                                values=[(str(uuid.uuid1()),
                                                         element)])
            return [ins_mutation]

        with TestPipeline() as p:
            (  # pylint: disable=expression-not-assigned
                p
                | 'Produce rows' >> Read(
                    SyntheticSource(self.parse_synthetic_source_options()))
                | 'Format' >> Map(format_record)
                | 'Make mutations' >> FlatMap(make_insert_mutations)
                | 'Write to Spanner' >> WriteToSpanner(
                    project_id=self.project,
                    instance_id=self.spanner_instance,
                    database_id=self.spanner_database,
                    max_batch_size_bytes=5120))
Example #9
 def _verify_data(self, pcol, init_size, data_size):
   read = pcol | 'read' >> ReadAllFromParquet()
   v1 = (
       read
       | 'get_number' >> Map(lambda x: x['number'])
       | 'sum_globally' >> CombineGlobally(sum)
       | 'validate_number' >>
       FlatMap(lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
   v2 = (
       read
       | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
       | 'count_per_key' >> Count.PerKey()
       | 'validate_name' >> FlatMap(
           lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
   _ = ((v1, v2, pcol)
        | 'flatten' >> Flatten()
        | 'reshuffle' >> Reshuffle()
        | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
Example #10
        def test_external_transforms(self):
            # TODO: Move the expansion service address into PipelineOptions.
            def get_expansion_service():
                return "localhost:" + str(self.expansion_port)

            with self.create_pipeline() as p:
                res = (p
                       | GenerateSequence(
                           start=1,
                           stop=10,
                           expansion_service=get_expansion_service()))

                assert_that(res, equal_to([i for i in range(1, 10)]))

            # We expect to fail here because we do not have a Kafka cluster handy.
            # Nevertheless, we check that the transform is expanded by the
            # ExpansionService and that the pipeline fails during execution.
            with self.assertRaises(Exception) as ctx:
                with self.create_pipeline() as p:
                    # pylint: disable=expression-not-assigned
                    (p
                     |
                     ReadFromKafka(consumer_config={
                         'bootstrap.servers':
                         'notvalid1:7777, notvalid2:3531'
                     },
                                   topics=['topic1', 'topic2'],
                                   key_deserializer='org.apache.kafka.'
                                   'common.serialization.'
                                   'ByteArrayDeserializer',
                                   value_deserializer='org.apache.kafka.'
                                   'common.serialization.'
                                   'LongDeserializer',
                                   expansion_service=get_expansion_service()))
            self.assertTrue(
                'No resolvable bootstrap urls given in bootstrap.servers'
                in str(ctx.exception),
                'Expected to fail due to invalid bootstrap.servers, but '
                'failed due to:\n%s' % str(ctx.exception))

            # We just test the expansion but do not execute.
            # pylint: disable=expression-not-assigned
            (self.create_pipeline()
             | Impulse()
             | Map(lambda input: (1, input))
             | WriteToKafka(producer_config={
                 'bootstrap.servers':
                 'localhost:9092, notvalid2:3531'
             },
                            topic='topic1',
                            key_serializer='org.apache.kafka.'
                            'common.serialization.'
                            'LongSerializer',
                            value_serializer='org.apache.kafka.'
                            'common.serialization.'
                            'ByteArraySerializer',
                            expansion_service=get_expansion_service()))
Example #11
def main(argv=None):
    options = PipelineOptions(argv)
    p = Pipeline(options=options)

    (p
     | GenerateSequence(
         0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
     | Map(lambda x: logging.info(x)))

    p.run()
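BEAM_IO_EXPANSION_SERVICE is defined elsewhere in that project. A minimal sketch of one way it could be defined, assuming the standard Beam IO expansion jar is wanted (the Gradle target is an assumption):

from apache_beam.transforms.external import BeamJarExpansionService

# Assumed definition; the actual project may point at a different jar or at a
# running localhost expansion service instead.
BEAM_IO_EXPANSION_SERVICE = BeamJarExpansionService(
    ':sdks:java:io:expansion-service:shadowJar')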
Example #12
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True
    project = options.view_as(GoogleCloudOptions).project

    p = Pipeline(options=options)
    (p
     | Create(EN_TEXTS)
     | ParDo(TranslateDoFn(project, SOURCE_LANGUAGE_CODE,
                           TARGET_LANGUAGE_CODE))
     | Map(print_translation))

    p.run()
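print_translation is a helper defined outside this snippet. Judging from Example #19 below, which logs each (source, translated) pair, a plausible sketch is (the element shape is an assumption):

import logging

def print_translation(pair):
    # Assumed element shape: (source text, translated text), as in Example #19.
    logging.info('%s -> %s', pair[0], pair[1])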
Example #13
def main(argv=None):
    options = PipelineOptions(argv)
    kafka_options = options.view_as(KafkaReadOptions)

    p = Pipeline(options=options)
    (p
     | ReadFromKafka(consumer_config={
         'bootstrap.servers': kafka_options.bootstrap_servers
     },
                     topics=[kafka_options.topic])
     | Map(lambda x: logging.info('kafka element: %s', x)))

    p.run()
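KafkaReadOptions is a custom options class, not part of Beam itself. A minimal sketch of how such a class is typically declared, with flag names assumed to mirror the attributes read in main():

from apache_beam.options.pipeline_options import PipelineOptions

class KafkaReadOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # Flag names match the attributes used above; defaults are assumptions.
        parser.add_argument('--bootstrap_servers', default='localhost:9092')
        parser.add_argument('--topic', default='beam-topic')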
Example #14
    def expand(self, xs):
        def as_dict(x):
            return JSONDict(**x._asdict())

        def encode_datetimes_to_s(x):
            for field in ['timestamp']:
                x[field] = (x[field].replace(tzinfo=pytz.utc) -
                            epoch).total_seconds()
            # logging.info("Encoded: %s", str(x))
            return x

        dataset, table = self.table.split('.')

        sink = WriteToBigQueryDatePartitioned(
            temp_gcs_location=self.temp_location,
            dataset=dataset,
            table=table,
            project=self.project,
            write_disposition="WRITE_TRUNCATE",
            schema=build_event_schema())

        logging.info('sink params: \n\t%s\n\t%s\n\t%s\n\t%s',
                     self.temp_location, dataset, table, self.project)

        return (xs
                | Map(as_dict)
                | Map(encode_datetimes_to_s)
                | Map(lambda x: TimestampedValue(x, x['timestamp']))
                | sink)
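encode_datetimes_to_s above references a module-level epoch constant that is not shown. A minimal sketch of the assumed definition (the Unix epoch as a timezone-aware datetime):

import datetime
import pytz

# Assumed module-level constant used by encode_datetimes_to_s.
epoch = datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)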
Example #15
 def test_sink_transform(self):
   with tempfile.NamedTemporaryFile() as dst:
     path = dst.name
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example #16
 def test_sink_transform(self):
   with TemporaryDirectory() as tmp_dirname:
      path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example #17
def run(options):

    visit_args = options.view_as(PortVisitsOptions)
    cloud_args = options.view_as(GoogleCloudOptions)

    p = beam.Pipeline(options=options)

    start_date = datetime.datetime.strptime(
        visit_args.start_date, '%Y-%m-%d').replace(tzinfo=pytz.utc)
    start_window = start_date - datetime.timedelta(
        days=visit_args.start_padding)
    end_date = datetime.datetime.strptime(visit_args.end_date,
                                          '%Y-%m-%d').replace(tzinfo=pytz.utc)

    dataset, table = visit_args.output_table.split('.')

    sink = WriteToBigQueryDatePartitioned(
        temp_gcs_location=cloud_args.temp_location,
        dataset=dataset,
        table=table,
        project=cloud_args.project,
        write_disposition="WRITE_TRUNCATE",
        schema=build_visit_schema())

    queries = VisitEvent.create_queries(visit_args.events_table, start_window,
                                        end_date)

    sources = [(p | "Read_{}".format(i) >> beam.io.Read(
        beam.io.gcp.bigquery.BigQuerySource(query=x)))
               for (i, x) in enumerate(queries)]

    tagged_records = (sources
                      | beam.Flatten()
                      | beam.Map(from_msg)
                      | CreatePortVisits()
                      | "FilterVisits" >> Filter(lambda x: start_date.date(
                      ) <= x.end_timestamp.date() <= end_date.date())
                      | Map(lambda x: TimestampedValue(
                          visit_to_msg(x), _datetime_to_s(x.end_timestamp)))
                      | sink)

    result = p.run()

    success_states = set(
        [PipelineState.DONE, PipelineState.RUNNING, PipelineState.UNKNOWN])

    logging.info('returning with result.state=%s' % result.state)
    return 0 if result.state in success_states else 1
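_datetime_to_s and visit_to_msg are project helpers that are not shown here. A hypothetical sketch of _datetime_to_s, assuming it converts a timezone-aware datetime into Unix seconds for TimestampedValue:

import datetime
import pytz

def _datetime_to_s(dt):
    # Seconds since the Unix epoch, suitable for a Beam TimestampedValue.
    return (dt - datetime.datetime(1970, 1, 1, tzinfo=pytz.utc)).total_seconds()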
Example #18
 def test_expand_kafka_write(self):
     # We just test the expansion but do not execute.
     # pylint: disable=expression-not-assigned
     (self.create_pipeline()
      | Impulse()
      | Map(lambda input: (1, input))
      | WriteToKafka(producer_config={
          'bootstrap.servers': 'localhost:9092, notvalid2:3531'
      },
                     topic='topic1',
                     key_serializer='org.apache.kafka.'
                     'common.serialization.'
                     'LongSerializer',
                     value_serializer='org.apache.kafka.'
                     'common.serialization.'
                     'ByteArraySerializer',
                     expansion_service=self.get_expansion_service()))
Example #19
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    project = options.view_as(GoogleCloudOptions).project
    assert project is not None, '"project" is not specified.'

    source_code = 'en-US'
    target_code = 'ja'
    texts = ['Hello', 'Thank you', 'Goodbye']

    p = Pipeline(options=options)
    (p
     | 'Texts' >> Create(texts)
     | 'Translate' >> ParDo(Translate(project, source_code, target_code))
     | 'Print' >> Map(lambda pair: logging.info('%s -> %s', pair[0], pair[1])))

    p.run()
Example #20
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)  # Large batch size with a 10-minute buffering trigger
   | Map(lambda kv: logging.info('key: %s, value count: %s',
                                 kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
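make_large_elements is not shown in this example; the inline comment only notes that each output is about 128 KiB. A hypothetical sketch consistent with that comment:

def make_large_elements(i):
    # Emit one element of roughly 128 KiB per input integer; the payload content
    # is an assumption, only its size matters for the batching demo.
    yield '%d:%s' % (i, 'x' * (128 * 1024))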
Example #21
    def test_fixed_windows_simple_watermark(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                             tsv('k1', 2, 0), tsv('k2', 2, 0)])
              .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
              .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
              .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
              .advance_watermark_to(1)
              .add_elements([tsv('k1', 6, 0)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed, one-second windows with DefaultTrigger (after watermark)
        windowing = Windowing(FixedWindows(1),
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k2', IntervalWindow(0, 1), [1, 2,
                                                  3]),  # On the watermark
                    ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
                    ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
                ]))
Example #22
  def test_buffering_timer_in_fixed_window_streaming(self):
    window_duration = 6
    max_buffering_duration_secs = 100

    start_time = timestamp.Timestamp(0)
    test_stream = (
        TestStream().add_elements([
            TimestampedValue(value, start_time + i) for i,
            value in enumerate(GroupIntoBatchesTest._create_test_data())
        ]).advance_processing_time(150).advance_watermark_to(
            start_time + window_duration).advance_watermark_to(
                start_time + window_duration +
                1).advance_watermark_to_infinity())

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0).
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | "fixed window" >> WindowInto(FixedWindows(window_duration))
          | util.GroupIntoBatches(
              GroupIntoBatchesTest.BATCH_SIZE,
              max_buffering_duration_secs,
              fake_clock)
          | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
          | "global window" >> WindowInto(GlobalWindows())
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # Window duration is 6 and batch size is 5, so output batch size
      # should be 5 (flush because of batch size reached).
      expected_0 = 5
      # There is only one element left in the window so batch size
      # should be 1 (flush because of max buffering duration reached).
      expected_1 = 1
      # Collection has 10 elements, there are only 4 left, so batch size should
      # be 4 (flush because of end of window reached).
      expected_2 = 4
      assert_that(
          num_elements_per_batch,
          equal_to([expected_0, expected_1, expected_2]),
          "assert2")
Example #23
    def test_sessions_and_complex_trigger_accumulating(self):
        def tsv(key, value, ts):
            return TimestampedValue((key, value), timestamp=ts)

        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                             tsv('k1', 3, 7), tsv('k1', 4, 30)])
              .advance_watermark_to(50)
              .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2),])
              .add_elements([tsv('k1', -1, 21)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Session windows with a 10-second gap, firing early after two elements
        # and late after each additional element, with accumulating panes.
        windowing = Windowing(Sessions(10),
                              triggerfn=AfterWatermark(early=AfterCount(2),
                                                       late=AfterCount(1)),
                              accumulation_mode=AccumulationMode.ACCUMULATING,
                              allowed_lateness=MAX_TIMESTAMP.seconds())

        with TestPipeline() as p:
            result = (p
                      | test_stream
                      | WindowInto(windowing.windowfn)
                      | ParDo(trigger_manager._ReifyWindows())
                      | ParDo(trigger_manager._GroupBundlesByKey())
                      | ParDo(
                          trigger_manager.GeneralTriggerManagerDoFn(windowing))
                      | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                         set(v.value for v in elm[1]))))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
                    ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
                    ('k1', IntervalWindow(30, 40), {4}),  # on time
                    ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
                    ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2,
                                                   -1}),  # late
                ]))
Example #24
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # The Synthetic Source emits two-part (key, value) records; we only
            # need the value part here.
            return {'data': base64.b64encode(record[1])}

        # pylint: disable=expression-not-assigned
        (self.pipeline
         | 'ProduceRows' >> Read(
             SyntheticSource(self.parseTestPipelineOptions()))
         | 'Format' >> Map(format_record)
         | 'WriteToBigQuery' >> WriteToBigQuery(
             self.output_dataset + '.' + self.output_table,
             schema=SCHEMA,
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_EMPTY))
Example #25
    def test_sliding_windows_simple_watermark(self):
        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
              .add_elements([('k1', 1), ('k2', 1)])
              .advance_watermark_to(1)
              .add_elements([('k1', 2), ('k2', 2)])
              .add_elements([('k1', 2), ('k2', 2)])
              .advance_watermark_to(2)
              .add_elements([('k1', 3), ('k2', 3)])
              .add_elements([('k1', 3), ('k2', 3)])
              .advance_watermark_to_infinity())
        # yapf: enable

        # Sliding two-second windows starting every second, with the default
        # trigger (fire at the watermark).
        windowing = Windowing(SlidingWindows(2, 1))

        with TestPipeline() as p:
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                |
                Map(lambda elm:
                    (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
                    ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
                    ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
                    ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
                    ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
                    ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
                    ('k1', IntervalWindow(2, 4), [3, 3]),
                    ('k2', IntervalWindow(2, 4), [3, 3]),
                ]))
Example #26
 def test_sink_transform_compressed(self, compression_type):
   if compression_type == 'lz4' and ARROW_MAJOR_VERSION == 1:
     return unittest.skip(
         "Writing with LZ4 compression is not supported in "
         "pyarrow 1.x")
   with TemporaryDirectory() as tmp_dirname:
      path = os.path.join(tmp_dirname, "tmp_filename")
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS) \
       | WriteToParquet(
           path, self.SCHEMA, codec=compression_type,
           num_shards=1, shard_name_template='')
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path + '*') \
           | Map(json.dumps)
       assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Example #27
def main(argv=None):
    options = PipelineOptions(argv)
    topic = options.view_as(PubSubTopicOptions).topic

    p = Pipeline(options=options)
    (p
     # This is the external transform
     # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`, which is different
     # from the native `apache_beam.io.gcp.pubsub.ReadFromPubSub` used in most
     # cases.
     #
     # If you set expansion_service to BeamJarExpansionService(
     # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'), it
     # will fail because that Beam jar has no dependency for the DirectRunner.
     # As a workaround, this project specifies a custom expansion service jar.
     | ReadFromPubSub(topic=topic,
                      with_attributes=True,
                      expansion_service=expansion_service(options))
     | Map(lambda message: logging.info("message: %s", message)))
    p.run()
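For contrast with the external transform used above, a minimal sketch of the same read using the native connector mentioned in the comments (the topic value and the streaming option are assumptions):

import logging
from apache_beam import Map, Pipeline
from apache_beam.io.gcp.pubsub import ReadFromPubSub as NativeReadFromPubSub
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

def native_read(topic):
    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True  # Pub/Sub reads are streaming
    with Pipeline(options=options) as p:
        (p
         | NativeReadFromPubSub(topic=topic, with_attributes=True)
         | Map(lambda message: logging.info('message: %s', message)))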
Example #28
 def test_sink_transform_compliant_nested_type(self):
   if ARROW_MAJOR_VERSION < 4:
     return unittest.skip(
         'Writing with compliant nested type is only '
         'supported in pyarrow 4.x and above')
   with TemporaryDirectory() as tmp_dirname:
      path = os.path.join(tmp_dirname, 'tmp_filename')
     with TestPipeline() as p:
       _ = p \
       | Create(self.RECORDS_NESTED) \
       | WriteToParquet(
           path, self.SCHEMA_NESTED, num_shards=1,
           shard_name_template='', use_compliant_nested_type=True)
     with TestPipeline() as p:
       # json used for stable sortability
       readback = \
           p \
           | ReadFromParquet(path) \
           | Map(json.dumps)
       assert_that(
           readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED]))
Example #29
  def test_buffering_timer_in_global_window_streaming(self):
    max_buffering_duration_secs = 42

    start_time = timestamp.Timestamp(0)
    test_stream = TestStream().advance_watermark_to(start_time)
    for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
      test_stream.add_elements(
          [TimestampedValue(value, start_time + i)]) \
        .advance_processing_time(5)
    test_stream.advance_watermark_to(
        start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
      .advance_watermark_to_infinity()

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements.
      # Since we're in a global window, we would have been waiting
      # for all the elements to arrive without the buffering time limit.
      batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0). Since the fake clock never really advances during
      # the pipeline execution, meaning that the timer is always set to the same
      # value, the timer will be fired on every element after the first firing.
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | WindowInto(
              GlobalWindows(),
              trigger=Repeatedly(AfterCount(1)),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | util.GroupIntoBatches(
              batch_size, max_buffering_duration_secs, fake_clock)
          | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # We will flush twice when the max buffering duration is reached and when
      # the global window ends.
      assert_that(num_elements_per_batch, equal_to([9, 1]))
Example #30
    def test(self):
        SCHEMA = parse_table_schema_from_json(
            '{"fields": [{"name": "data", "type": "BYTES"}]}')

        def format_record(record):
            # The Synthetic Source emits two-part (key, value) records; we only
            # need the value part here.
            return {'data': base64.b64encode(record[1])}

        (  # pylint: disable=expression-not-assigned
            self.pipeline
            | 'Produce rows' >> Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
            | 'Format' >> Map(format_record)
            | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
            | 'Write to BigQuery' >> WriteToBigQuery(
                dataset=self.output_dataset,
                table=self.output_table,
                schema=SCHEMA,
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_TRUNCATE))