def test_no_annotations(self):
  def fn(a: int) -> int:
    return a

  with self.assertRaisesRegex(TypeCheckError,
                              r'requires .*int.* but got .*str'):
    _ = ['a', 'b', 'c'] | Map(fn)

  # Same pipeline doesn't raise without annotations on fn.
  fn = decorators.no_annotations(fn)
  _ = ['a', 'b', 'c'] | Map(fn)
def test_origin(self):
  def annotated(e: str) -> str:
    return e

  t = Map(annotated)
  th = t.get_type_hints()
  th = th.with_input_types(str)
  self.assertRegex(th.debug_str(), r'with_input_types')
  th = th.with_output_types(str)
  self.assertRegex(
      th.debug_str(),
      r'(?s)with_output_types.*with_input_types.*Map.annotated')
def test_no_annotations(self):
  def fn(a: int) -> int:
    return a

  _ = [1, 2, 3] | Map(fn)  # Doesn't raise - correct types.

  with self.assertRaisesRegex(TypeCheckError,
                              r'requires .*int.* but got .*str'):
    _ = ['a', 'b', 'c'] | Map(fn)

  @decorators.no_annotations
  def fn2(a: int) -> int:
    return a

  _ = ['a', 'b', 'c'] | Map(fn2)  # Doesn't raise - no input type hints.
def _create_input_data(self): """ Runs an additional pipeline which creates test data and waits for its completion. """ SCHEMA = parse_table_schema_from_json( '{"fields": [{"name": "data", "type": "BYTES"}]}') def format_record(record): # Since Synthetic Source returns data as a dictionary, we should skip one # of the part import base64 return {'data': base64.b64encode(record[1])} with TestPipeline() as p: ( # pylint: disable=expression-not-assigned p | 'Produce rows' >> Read( SyntheticSource(self.parse_synthetic_source_options())) | 'Format' >> Map(format_record) | 'Write to BigQuery' >> WriteToBigQuery( dataset=self.input_dataset, table=self.input_table, schema=SCHEMA, create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=BigQueryDisposition.WRITE_EMPTY))
def main():
  # bq_source = BigQuerySource(query="""
  #                            SELECT created_at, text
  #                            FROM got_sentiment.got_tweets
  #                            """,
  #                            validate=False, coder=None,
  #                            use_standard_sql=True, flatten_results=True,
  #                            kms_key=None)

  # Removed attributes from ReadFromPubSub:
  #   with_attributes=False,
  #   timestamp_attribute='created_at'

  # Create the Pipeline with the specified options.
  with Pipeline(options=options) as p:
    results = (
        p
        | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
        | 'Window' >> WindowInto(window.FixedWindows(60))
        | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
        | 'Combine' >> CombinePerKey(EntityScoreCombine())
        | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
        | 'FormatForWrite' >> Map(format_for_write)
        | 'Write' >> WriteToBigQuery(
            'streaming_scores',
            dataset=BQ_DATASET,
            project=PROJECT_ID,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND',
            batch_size=20))
def test_fixed_after_count_accumulating(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k1', 1)])
      .advance_watermark_to(2)
      .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, two-second windows, firing repeatedly after every two elements,
  # accumulating fired panes.
  windowing = Windowing(
      FixedWindows(2),
      triggerfn=Repeatedly(AfterCount(2)),
      accumulation_mode=AccumulationMode.ACCUMULATING)

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(lambda elm:
              (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 2), [1, 1]),
            ('k2', IntervalWindow(0, 2), [1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
        ]))
def test(self):
  def format_record(record):
    import base64
    return base64.b64encode(record[1])

  def make_insert_mutations(element):
    import uuid  # pylint: disable=reimported
    from apache_beam.io.gcp.experimental.spannerio import WriteMutation
    ins_mutation = WriteMutation.insert(
        table='test',
        columns=('id', 'data'),
        values=[(str(uuid.uuid1()), element)])
    return [ins_mutation]

  (  # pylint: disable=expression-not-assigned
      self.pipeline
      | 'Produce rows' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
      | 'Format' >> Map(format_record)
      | 'Make mutations' >> FlatMap(make_insert_mutations)
      | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to Spanner' >> WriteToSpanner(
          project_id=self.project,
          instance_id=self.spanner_instance,
          database_id=self.TEST_DATABASE,
          max_batch_size_bytes=5120))
def _create_input_data(self): """ Runs an additional pipeline which creates test data and waits for its completion. """ def format_record(record): import base64 return base64.b64encode(record[1]) def make_insert_mutations(element): import uuid from apache_beam.io.gcp.experimental.spannerio import WriteMutation ins_mutation = WriteMutation.insert(table='test_data', columns=('id', 'data'), values=[(str(uuid.uuid1()), element)]) return [ins_mutation] with TestPipeline() as p: ( # pylint: disable=expression-not-assigned p | 'Produce rows' >> Read( SyntheticSource(self.parse_synthetic_source_options())) | 'Format' >> Map(format_record) | 'Make mutations' >> FlatMap(make_insert_mutations) | 'Write to Spanner' >> WriteToSpanner( project_id=self.project, instance_id=self.spanner_instance, database_id=self.spanner_database, max_batch_size_bytes=5120))
def _verify_data(self, pcol, init_size, data_size):
  read = pcol | 'read' >> ReadAllFromParquet()
  v1 = (
      read
      | 'get_number' >> Map(lambda x: x['number'])
      | 'sum_globally' >> CombineGlobally(sum)
      | 'validate_number' >> FlatMap(
          lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
  v2 = (
      read
      | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
      | 'count_per_key' >> Count.PerKey()
      | 'validate_name' >> FlatMap(
          lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
  _ = ((v1, v2, pcol)
       | 'flatten' >> Flatten()
       | 'reshuffle' >> Reshuffle()
       | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
def test_external_transforms(self):
  # TODO: Move the expansion service address into PipelineOptions.
  def get_expansion_service():
    return "localhost:" + str(self.expansion_port)

  with self.create_pipeline() as p:
    res = (
        p
        | GenerateSequence(
            start=1, stop=10, expansion_service=get_expansion_service()))
    assert_that(res, equal_to([i for i in range(1, 10)]))

  # We expect to fail here because we do not have a Kafka cluster handy.
  # Nevertheless, we check that the transform is expanded by the
  # ExpansionService and that the pipeline fails during execution.
  with self.assertRaises(Exception) as ctx:
    with self.create_pipeline() as p:
      # pylint: disable=expression-not-assigned
      (
          p
          | ReadFromKafka(
              consumer_config={
                  'bootstrap.servers': 'notvalid1:7777, notvalid2:3531'
              },
              topics=['topic1', 'topic2'],
              key_deserializer='org.apache.kafka.'
              'common.serialization.'
              'ByteArrayDeserializer',
              value_deserializer='org.apache.kafka.'
              'common.serialization.'
              'LongDeserializer',
              expansion_service=get_expansion_service()))
  self.assertTrue(
      'No resolvable bootstrap urls given in bootstrap.servers' in str(
          ctx.exception),
      'Expected to fail due to invalid bootstrap.servers, but '
      'failed due to:\n%s' % str(ctx.exception))

  # We just test the expansion but do not execute.
  # pylint: disable=expression-not-assigned
  (
      self.create_pipeline()
      | Impulse()
      | Map(lambda input: (1, input))
      | WriteToKafka(
          producer_config={
              'bootstrap.servers': 'localhost:9092, notvalid2:3531'
          },
          topic='topic1',
          key_serializer='org.apache.kafka.'
          'common.serialization.'
          'LongSerializer',
          value_serializer='org.apache.kafka.'
          'common.serialization.'
          'ByteArraySerializer',
          expansion_service=get_expansion_service()))
def main(argv=None):
  options = PipelineOptions(argv)

  p = Pipeline(options=options)
  (
      p
      | GenerateSequence(
          0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
      | Map(lambda x: logging.info(x)))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  project = options.view_as(GoogleCloudOptions).project

  p = Pipeline(options=options)
  (
      p
      | Create(EN_TEXTS)
      | ParDo(
          TranslateDoFn(project, SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE))
      | Map(print_translation))
  p.run()
def main(argv=None):
  options = PipelineOptions(argv)
  kafka_options = options.view_as(KafkaReadOptions)

  p = Pipeline(options=options)
  (
      p
      | ReadFromKafka(
          consumer_config={
              'bootstrap.servers': kafka_options.bootstrap_servers
          },
          topics=[kafka_options.topic])
      | Map(lambda x: logging.info('kafka element: %s', x)))
  p.run()
def expand(self, xs):
  def as_dict(x):
    d = JSONDict(**x._asdict())
    return d

  def encode_datetimes_to_s(x):
    for field in ['timestamp']:
      x[field] = (x[field].replace(tzinfo=pytz.utc) - epoch).total_seconds()
    # logging.info("Encoded: %s", str(x))
    return x

  dataset, table = self.table.split('.')
  sink = WriteToBigQueryDatePartitioned(
      temp_gcs_location=self.temp_location,
      dataset=dataset,
      table=table,
      project=self.project,
      write_disposition="WRITE_TRUNCATE",
      schema=build_event_schema())

  logging.info(
      'sink params: \n\t%s\n\t%s\n\t%s\n\t%s',
      self.temp_location, dataset, table, self.project)

  return (
      xs
      | Map(as_dict)
      | Map(encode_datetimes_to_s)
      | Map(lambda x: TimestampedValue(x, x['timestamp']))
      | sink)
def test_sink_transform(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      _ = p \
      | Create(self.RECORDS) \
      | WriteToParquet(
          path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def test_sink_transform(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
      | Create(self.RECORDS) \
      | WriteToParquet(
          path, self.SCHEMA, num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def run(options):
  visit_args = options.view_as(PortVisitsOptions)
  cloud_args = options.view_as(GoogleCloudOptions)

  p = beam.Pipeline(options=options)

  start_date = datetime.datetime.strptime(
      visit_args.start_date, '%Y-%m-%d').replace(tzinfo=pytz.utc)
  start_window = start_date - datetime.timedelta(
      days=visit_args.start_padding)
  end_date = datetime.datetime.strptime(
      visit_args.end_date, '%Y-%m-%d').replace(tzinfo=pytz.utc)

  dataset, table = visit_args.output_table.split('.')

  sink = WriteToBigQueryDatePartitioned(
      temp_gcs_location=cloud_args.temp_location,
      dataset=dataset,
      table=table,
      project=cloud_args.project,
      write_disposition="WRITE_TRUNCATE",
      schema=build_visit_schema())

  queries = VisitEvent.create_queries(
      visit_args.events_table, start_window, end_date)

  sources = [
      (p | "Read_{}".format(i) >> beam.io.Read(
          beam.io.gcp.bigquery.BigQuerySource(query=x)))
      for (i, x) in enumerate(queries)
  ]

  tagged_records = (
      sources
      | beam.Flatten()
      | beam.Map(from_msg)
      | CreatePortVisits()
      | "FilterVisits" >> Filter(
          lambda x: start_date.date() <= x.end_timestamp.date() <=
          end_date.date())
      | Map(
          lambda x: TimestampedValue(
              visit_to_msg(x), _datetime_to_s(x.end_timestamp)))
      | sink)

  result = p.run()

  success_states = set(
      [PipelineState.DONE, PipelineState.RUNNING, PipelineState.UNKNOWN])
  logging.info('returning with result.state=%s' % result.state)
  return 0 if result.state in success_states else 1
def test_expand_kafka_write(self):
  # We just test the expansion but do not execute.
  # pylint: disable=expression-not-assigned
  (
      self.create_pipeline()
      | Impulse()
      | Map(lambda input: (1, input))
      | WriteToKafka(
          producer_config={
              'bootstrap.servers': 'localhost:9092, notvalid2:3531'
          },
          topic='topic1',
          key_serializer='org.apache.kafka.'
          'common.serialization.'
          'LongSerializer',
          value_serializer='org.apache.kafka.'
          'common.serialization.'
          'ByteArraySerializer',
          expansion_service=self.get_expansion_service()))
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  project = options.view_as(GoogleCloudOptions).project
  assert project is not None, '"project" is not specified.'

  source_code = 'en-US'
  target_code = 'ja'
  texts = ['Hello', 'Thank you', 'Goodbye']

  p = Pipeline(options=options)
  (
      p
      | 'Texts' >> Create(texts)
      | 'Translate' >> ParDo(Translate(project, source_code, target_code))
      | 'Print' >> Map(
          lambda pair: logging.info('%s -> %s', pair[0], pair[1])))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (
      p
      | Create(range(100), reshuffle=True)
      | ParDo(make_large_elements)  # 128 KiB
      | WithKeys('')
      # Large batch size with a 600-second buffering limit.
      | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)
      | Map(
          lambda kv: logging.info(
              'key: %s, value count: %s', kv[0], len(kv[1]))))
  run = p.run()
  run.wait_until_finish()
def test_fixed_windows_simple_watermark(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                     tsv('k1', 2, 0), tsv('k2', 2, 0)])
      .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
      .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
      .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
      .advance_watermark_to(1)
      .add_elements([tsv('k1', 6, 0)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with DefaultTrigger (after watermark).
  windowing = Windowing(
      FixedWindows(1), allowed_lateness=MAX_TIMESTAMP.seconds())

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(lambda elm:
              (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
        ]))
def test_buffering_timer_in_fixed_window_streaming(self):
  window_duration = 6
  max_buffering_duration_secs = 100

  start_time = timestamp.Timestamp(0)
  test_stream = (
      TestStream().add_elements([
          TimestampedValue(value, start_time + i)
          for i, value in enumerate(GroupIntoBatchesTest._create_test_data())
      ]).advance_processing_time(150).advance_watermark_to(
          start_time + window_duration).advance_watermark_to(
              start_time + window_duration +
              1).advance_watermark_to_infinity())

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # To trigger the processing time timer, use a fake clock with start time
    # being Timestamp(0).
    fake_clock = FakeClock(now=start_time)

    num_elements_per_batch = (
        pipeline
        | test_stream
        | "fixed window" >> WindowInto(FixedWindows(window_duration))
        | util.GroupIntoBatches(
            GroupIntoBatchesTest.BATCH_SIZE,
            max_buffering_duration_secs,
            fake_clock)
        | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
        | "global window" >> WindowInto(GlobalWindows())
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # Window duration is 6 and batch size is 5, so the first batch should
    # contain 5 elements (flushed because the batch size was reached).
    expected_0 = 5
    # Only one element is left in that window, so the next batch should
    # contain 1 element (flushed because the max buffering duration was
    # reached).
    expected_1 = 1
    # The collection has 10 elements, so only 4 are left, and the last batch
    # should contain 4 elements (flushed because the end of the window was
    # reached).
    expected_2 = 4
    assert_that(
        num_elements_per_batch,
        equal_to([expected_0, expected_1, expected_2]),
        "assert2")
def test_sessions_and_complex_trigger_accumulating(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                     tsv('k1', 3, 7), tsv('k1', 4, 30)])
      .advance_watermark_to(50)
      .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2)])
      .add_elements([tsv('k1', -1, 21)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Session windows with a 10-second gap, early firings after every two
  # elements, late firings after every additional element, accumulating
  # fired panes.
  windowing = Windowing(
      Sessions(10),
      triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=MAX_TIMESTAMP.seconds())

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(lambda elm:
              (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1]))))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
            ('k1', IntervalWindow(30, 40), {4}),  # on time
            ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
            ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
        ]))
def test(self): SCHEMA = parse_table_schema_from_json( '{"fields": [{"name": "data", "type": "BYTES"}]}') def format_record(record): # Since Synthetic Source returns data as a dictionary, we should skip one # of the part return {'data': base64.b64encode(record[1])} # pylint: disable=expression-not-assigned (self.pipeline | 'ProduceRows' >> Read( SyntheticSource(self.parseTestPipelineOptions())) | 'Format' >> Map(format_record) | 'WriteToBigQuery' >> WriteToBigQuery( self.output_dataset + '.' + self.output_table, schema=SCHEMA, create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=BigQueryDisposition.WRITE_EMPTY))
def test_sliding_windows_simple_watermark(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k2', 1)])
      .advance_watermark_to(1)
      .add_elements([('k1', 2), ('k2', 2)])
      .add_elements([('k1', 2), ('k2', 2)])
      .advance_watermark_to(2)
      .add_elements([('k1', 3), ('k2', 3)])
      .add_elements([('k1', 3), ('k2', 3)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Sliding windows of size 2 with period 1, with DefaultTrigger
  # (after watermark).
  windowing = Windowing(SlidingWindows(2, 1))

  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(lambda elm:
              (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k1', IntervalWindow(2, 4), [3, 3]),
            ('k2', IntervalWindow(2, 4), [3, 3]),
        ]))
def test_sink_transform_compressed(self, compression_type):
  if compression_type == 'lz4' and ARROW_MAJOR_VERSION == 1:
    return unittest.skip(
        "Writing with LZ4 compression is not supported in pyarrow 1.x")
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      _ = p \
      | Create(self.RECORDS) \
      | WriteToParquet(
          path, self.SCHEMA, codec=compression_type,
          num_shards=1, shard_name_template='')
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path + '*') \
          | Map(json.dumps)
      assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
def main(argv=None):
  options = PipelineOptions(argv)
  topic = options.view_as(PubSubTopicOptions).topic

  p = Pipeline(options=options)
  (
      p
      # This is the external transform
      # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`. It is different
      # from `apache_beam.io.gcp.pubsub.ReadFromPubSub`, the native transform
      # used in most cases.
      #
      # If you set expansion_service to BeamJarExpansionService(
      # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'), it
      # will fail because the Beam jar has no dependency for DirectRunner. As
      # a workaround, specify the custom expansion service jar in this
      # project.
      | ReadFromPubSub(
          topic=topic,
          with_attributes=True,
          expansion_service=expansion_service(options))
      | Map(lambda message: logging.info("message: %s", message)))
  p.run()
def test_sink_transform_compliant_nested_type(self):
  if ARROW_MAJOR_VERSION < 4:
    return unittest.skip(
        'Writing with compliant nested type is only '
        'supported in pyarrow 4.x and above')
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, 'tmp_filename')
    with TestPipeline() as p:
      _ = p \
      | Create(self.RECORDS_NESTED) \
      | WriteToParquet(
          path, self.SCHEMA_NESTED, num_shards=1,
          shard_name_template='', use_compliant_nested_type=True)
    with TestPipeline() as p:
      # json used for stable sortability
      readback = \
          p \
          | ReadFromParquet(path) \
          | Map(json.dumps)
      assert_that(
          readback, equal_to([json.dumps(r) for r in self.RECORDS_NESTED]))
def test_buffering_timer_in_global_window_streaming(self):
  max_buffering_duration_secs = 42

  start_time = timestamp.Timestamp(0)
  test_stream = TestStream().advance_watermark_to(start_time)
  for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
    test_stream.add_elements(
        [TimestampedValue(value, start_time + i)]) \
        .advance_processing_time(5)
  test_stream.advance_watermark_to(
      start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
      .advance_watermark_to_infinity()

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # Set a batch size larger than the total number of elements.
    # Since we're in a global window, we would have been waiting
    # for all the elements to arrive without the buffering time limit.
    batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

    # To trigger the processing time timer, use a fake clock with start time
    # being Timestamp(0). Since the fake clock never advances during pipeline
    # execution, the timer is always set to the same value, so it fires on
    # every element after the first firing.
    fake_clock = FakeClock(now=start_time)

    num_elements_per_batch = (
        pipeline
        | test_stream
        | WindowInto(
            GlobalWindows(),
            trigger=Repeatedly(AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | util.GroupIntoBatches(
            batch_size, max_buffering_duration_secs, fake_clock)
        | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # We will flush twice: when the max buffering duration is reached and
    # when the global window ends.
    assert_that(num_elements_per_batch, equal_to([9, 1]))
def test(self): SCHEMA = parse_table_schema_from_json( '{"fields": [{"name": "data", "type": "BYTES"}]}') def format_record(record): # Since Synthetic Source returns data as a dictionary, we should skip one # of the part return {'data': base64.b64encode(record[1])} ( # pylint: disable=expression-not-assigned self.pipeline | 'Produce rows' >> Read( SyntheticSource(self.parse_synthetic_source_options())) | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace)) | 'Format' >> Map(format_record) | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace)) | 'Write to BigQuery' >> WriteToBigQuery( dataset=self.output_dataset, table=self.output_table, schema=SCHEMA, create_disposition=BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=BigQueryDisposition.WRITE_TRUNCATE))