def main():
  # bq_source = BigQuerySource(
  #     query="""
  #           SELECT created_at, text
  #           FROM got_sentiment.got_tweets
  #           """,
  #     validate=False,
  #     coder=None,
  #     use_standard_sql=True,
  #     flatten_results=True,
  #     kms_key=None)

  # Removed attributes from ReadFromPubSub:
  #     with_attributes=False,
  #     timestamp_attribute='created_at'

  # Create the Pipeline with the specified options.
  with Pipeline(options=options) as p:
    results = (
        p
        | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
        | 'Window' >> WindowInto(window.FixedWindows(60))
        | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
        | 'Combine' >> CombinePerKey(EntityScoreCombine())
        | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
        | 'FormatForWrite' >> Map(format_for_write)
        | 'Write' >> WriteToBigQuery(
            'streaming_scores',
            dataset=BQ_DATASET,
            project=PROJECT_ID,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND',
            batch_size=20))
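# A minimal sketch of one of the helpers referenced above. The original
# snippet does not define AddWindowTimestampFn; this hypothetical version
# stamps each combined (entity, score) pair with the end of its window so a
# downstream step can format it for BigQuery.
class AddWindowTimestampFn(beam.DoFn):
  def process(self, element, window=beam.DoFn.WindowParam):
    entity, score = element
    # window.end is a Timestamp; convert it to an ISO-8601 string.
    yield (entity, score, window.end.to_utc_datetime().isoformat())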
def test_model_setting_trigger(self):
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    test_stream = (
        TestStream().advance_watermark_to(10).add_elements(
            ['a', 'a', 'a', 'b', 'b']).advance_watermark_to(70)
        .advance_processing_time(600))
    pcollection = (
        p
        | test_stream
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))
    counts = (
        # [START model_setting_trigger]
        pcollection | WindowInto(
            FixedWindows(1 * 60),
            trigger=AfterProcessingTime(10 * 60),
            accumulation_mode=AccumulationMode.DISCARDING)
        # [END model_setting_trigger]
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(
            lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    assert_that(counts, equal_to([('a', 3), ('b', 2)]))
def test_model_early_late_triggers(self):
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(StandardOptions).streaming = True

  with TestPipeline(options=pipeline_options) as p:
    test_stream = (
        TestStream().advance_watermark_to(10).add_elements(
            ['a', 'a', 'a', 'b', 'b'])
        .add_elements([TimestampedValue('a', 10)])
        .advance_watermark_to(20).advance_processing_time(60)
        .add_elements([TimestampedValue('a', 10)]))
    trigger = (
        # [START model_early_late_triggers]
        AfterWatermark(
            early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
        # [END model_early_late_triggers]
    )
    counts = (
        p
        | test_stream
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | WindowInto(
            FixedWindows(15),
            trigger=trigger,
            allowed_lateness=20,
            accumulation_mode=AccumulationMode.DISCARDING)
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(
            lambda word_ones: (word_ones[0], sum(word_ones[1]))))
    assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
def test_fixed_after_count_accumulating(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k1', 1)])
      .advance_watermark_to(2)
      .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, two-second windows with a Repeatedly(AfterCount(2)) trigger in
  # accumulating mode.
  windowing = Windowing(
      FixedWindows(2),
      triggerfn=Repeatedly(AfterCount(2)),
      accumulation_mode=AccumulationMode.ACCUMULATING)
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 2), [1, 1]),
            ('k2', IntervalWindow(0, 2), [1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
        ]))
def test_in_streaming_mode(self):
  timestamp_interval = 1
  offset = itertools.count(0)
  start_time = timestamp.Timestamp(0)
  window_duration = 6
  test_stream = (
      TestStream().advance_watermark_to(start_time).add_elements([
          TimestampedValue(x, next(offset) * timestamp_interval)
          for x in GroupIntoBatchesTest._create_test_data()
      ]).advance_watermark_to(start_time + (window_duration - 1))
      .advance_watermark_to(start_time + (window_duration + 1))
      .advance_watermark_to(start_time + GroupIntoBatchesTest.NUM_ELEMENTS)
      .advance_watermark_to_infinity())
  pipeline = TestPipeline(options=StandardOptions(streaming=True))

  # Window duration is 6 and batch size is 5, so the first output batch
  # should have 5 elements (flush because the batch size was reached).
  expected_0 = 5
  # There is only one element left in the window, so the next batch should
  # have 1 element (flush because the end of the window was reached).
  expected_1 = 1
  # The collection has 10 elements and only 4 are left, so the last batch
  # should have 4 elements (flush because the end of the collection was
  # reached).
  expected_2 = 4

  collection = (
      pipeline
      | test_stream
      | WindowInto(FixedWindows(window_duration))
      | util.GroupIntoBatches(GroupIntoBatchesTest.BATCH_SIZE))
  num_elements_in_batches = collection | beam.Map(len)

  result = pipeline.run()
  result.wait_until_finish()
  assert_that(
      num_elements_in_batches,
      equal_to([expected_0, expected_1, expected_2]))
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      'input_topic', type=str, help="Input Pub/Sub topic name.")
  parser.add_argument(
      'output_table',
      type=str,
      help="Output BigQuery table name. Example: project.db.name")
  parser.add_argument(
      '--model_project', type=str, help="Google Project ID with the model.")
  parser.add_argument(
      '--model_name', type=str, help="Name of the AI Platform model.")
  parser.add_argument(
      '--model_region', type=str, help="AI Platform region name.")
  parser.add_argument(
      '--model_version', type=str, help="AI Platform model version.")
  known_args, pipeline_args = parser.parse_known_args(argv)

  _topic_comp = known_args.input_topic.split('/')
  if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects' or
      _topic_comp[2] != 'topics'):
    raise ValueError("Topic name has an inappropriate format.")
  if len(known_args.output_table.split('.')) != 2:
    raise ValueError("Table name has an inappropriate format.")

  inf_args = [
      known_args.model_project,
      known_args.model_name,
      known_args.model_region,
      known_args.model_version
  ]
  options = PipelineOptions(pipeline_args)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  p = Pipeline(options=options)
  _ = (
      p
      | 'read from pub/sub' >> ReadFromPubSub(
          known_args.input_topic).with_output_types(bytes)
      | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
      | 'convert to dict' >> Map(json.loads)
      | 'pre processing' >> PreProcessing()
      | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
      | 'format message' >> Map(formatter)
      | 'write to BQ' >> WriteToBigQuery(
          table=known_args.output_table,
          schema=build_bq_schema(),
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_APPEND))

  if os.environ.get('DEPLOY'):
    # Use p.run() instead of the `with Pipeline() as p` context manager so
    # the process can exit without waiting for the streaming job to finish.
    p.run()
  else:
    p.run().wait_until_finish()
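# build_bq_schema is referenced above but not defined in this excerpt. A
# minimal sketch: a comma-separated 'name:TYPE' string is one of the schema
# formats accepted by WriteToBigQuery. The field names here are assumptions,
# not taken from the original pipeline.
def build_bq_schema():
  return 'created_at:TIMESTAMP, text:STRING, prediction:STRING'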
def test_buffering_timer_in_fixed_window_streaming(self):
  window_duration = 6
  max_buffering_duration_secs = 100

  start_time = timestamp.Timestamp(0)
  test_stream = (
      TestStream().add_elements([
          TimestampedValue(value, start_time + i)
          for i, value in enumerate(GroupIntoBatchesTest._create_test_data())
      ]).advance_processing_time(150)
      .advance_watermark_to(start_time + window_duration)
      .advance_watermark_to(start_time + window_duration + 1)
      .advance_watermark_to_infinity())

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # To trigger the processing time timer, use a fake clock with start time
    # being Timestamp(0).
    fake_clock = FakeClock(now=start_time)

    num_elements_per_batch = (
        pipeline
        | test_stream
        | "fixed window" >> WindowInto(FixedWindows(window_duration))
        | util.GroupIntoBatches(
            GroupIntoBatchesTest.BATCH_SIZE,
            max_buffering_duration_secs,
            fake_clock)
        | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
        | "global window" >> WindowInto(GlobalWindows())
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # Window duration is 6 and batch size is 5, so the first output batch
    # should have 5 elements (flush because the batch size was reached).
    expected_0 = 5
    # There is only one element left in the window, so the next batch should
    # have 1 element (flush because the max buffering duration was reached).
    expected_1 = 1
    # The collection has 10 elements and only 4 are left, so the last batch
    # should have 4 elements (flush because the end of the window was
    # reached).
    expected_2 = 4
    assert_that(
        num_elements_per_batch,
        equal_to([expected_0, expected_1, expected_2]),
        "assert2")
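# FakeClock is referenced by the tests above but not defined in this
# excerpt. A plausible minimal stand-in (an assumption, not the original
# helper): GroupIntoBatches accepts a `clock` callable that it invokes to
# read the current time when arming its max-buffering timer, so a callable
# object with a manually controlled time makes the timer deterministic.
class FakeClock(object):
  def __init__(self, now):
    self._now = now

  def __call__(self):
    # Return the manually controlled "current" time.
    return self._now

  def advance(self, duration_secs):
    self._now += duration_secs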
def expand(self, pcoll):
  return (
      pcoll
      # Bind window info to each element using the element timestamp (or
      # publish time).
      | "Window into fixed intervals" >> WindowInto(
          FixedWindows(self.window_size))
      | "Add timestamp to windowed elements" >> ParDo(AddTimestamp())
      # Assign a random key to each windowed element based on the number of
      # shards.
      | "Add key" >> WithKeys(
          lambda _: random.randint(0, self.num_shards - 1))
      # Group windowed elements by key. All the elements in the same window
      # must fit in memory for this. If not, you need to use
      # `beam.util.BatchElements`.
      | "Group by key" >> GroupByKey())
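# A hypothetical usage sketch for the transform above. The enclosing class
# name (GroupMessagesByFixedWindows) and the topic variable are assumptions;
# the original excerpt only shows the expand() body.
def run_sketch(input_topic, options):
  with Pipeline(options=options) as p:
    _ = (
        p
        | "Read from Pub/Sub" >> ReadFromPubSub(topic=input_topic)
        # Window into 60-second windows and spread elements over 5 shards.
        | "Window and shard" >> GroupMessagesByFixedWindows(
            window_size=60, num_shards=5))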
def expand(self, input):
  # [START EXERCISE 2]:
  # Docs: https://beam.apache.org/documentation/sdks/pydoc/2.5.0/
  # Developer Docs: https://beam.apache.org/documentation/programming-guide/#windowing
  # Also: https://cloud.google.com/dataflow/model/windowing
  return (
      input
      # WindowInto() takes a WindowFn and returns a PTransform that applies
      # windowing to the PCollection. FixedWindows() returns a WindowFn that
      # assigns elements to windows of a fixed duration in seconds.
      # Use these methods to apply fixed windows of size self.duration to
      # the PCollection.
      | WindowInto(FixedWindows(self.duration))
      # Remember the ExtractAndSumScore PTransform from Exercise 1?
      # We parameterized it over the key field (see below). Use it here to
      # compute the "team" scores.
      | ExtractAndSumScore("team")
      # [END EXERCISE 2]
  )
def _pipeline_runner():
  with beam.Pipeline(runner=DirectRunner()) as p:
    ts = TestStream().advance_watermark_to(0)
    all_elements = iter(range(size))
    watermark = 0
    while True:
      next_batch = list(itertools.islice(all_elements, 100))
      if not next_batch:
        break
      ts = ts.add_elements(
          [(i, random.randint(0, 1000)) for i in next_batch])
      watermark = watermark + 100
      ts = ts.advance_watermark_to(watermark)
    ts = ts.advance_watermark_to_infinity()

    input_pc = p | ts | WindowInto(FixedWindows(100))
    for i in range(NUM_PARALLEL_STAGES):
      _build_serial_stages(input_pc, NUM_SERIAL_STAGES, i)
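# _build_serial_stages is not defined in this excerpt. A plausible sketch
# (an assumption, not the original helper): chain num_serial_stages trivial
# per-element stages onto the input PCollection, using the branch index to
# keep transform labels unique across the parallel branches.
def _build_serial_stages(pcoll, num_serial_stages, branch_idx):
  for stage_idx in range(num_serial_stages):
    pcoll = (
        pcoll
        | ('branch_%d_stage_%d' % (branch_idx, stage_idx)) >>
        Map(lambda kv: kv))
  return pcoll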
def test_fixed_windows_simple_watermark(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k2', 1, 0),
                     tsv('k1', 2, 0), tsv('k2', 2, 0)])
      .add_elements([tsv('k1', 3, 0), tsv('k2', 3, 0)])
      .add_elements([tsv('k1', 4, 1), tsv('k2', 4, 1)])
      .add_elements([tsv('k1', 5, 1), tsv('k2', 5, 1)])
      .advance_watermark_to(1)
      .add_elements([tsv('k1', 6, 0)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with DefaultTrigger (after watermark).
  windowing = Windowing(
      FixedWindows(1), allowed_lateness=MAX_TIMESTAMP.seconds())
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k2', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
            ('k1', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k2', IntervalWindow(1, 2), [4, 5]),  # On the watermark
            ('k1', IntervalWindow(0, 1), [6]),  # After the watermark
        ]))
def test_sessions_and_complex_trigger_accumulating(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 1), tsv('k1', 2, 15),
                     tsv('k1', 3, 7), tsv('k1', 4, 30)])
      .advance_watermark_to(50)
      .add_elements([tsv('k1', -3, 1), tsv('k1', -2, 2)])
      .add_elements([tsv('k1', -1, 21)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Session windows with a ten-second gap and an AfterWatermark trigger
  # with early and late firings, in accumulating mode.
  windowing = Windowing(
      Sessions(10),
      triggerfn=AfterWatermark(early=AfterCount(2), late=AfterCount(1)),
      accumulation_mode=AccumulationMode.ACCUMULATING,
      allowed_lateness=MAX_TIMESTAMP.seconds())
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], set(v.value for v in elm[1]))))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # early
            ('k1', IntervalWindow(1, 25), {1, 2, 3}),  # on time
            ('k1', IntervalWindow(30, 40), {4}),  # on time
            ('k1', IntervalWindow(1, 25), {1, 2, 3, -3, -2}),  # late
            ('k1', IntervalWindow(1, 40), {1, 2, 3, 4, -3, -2, -1}),  # late
        ]))
def test_sliding_windows_simple_watermark(self):
  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([('k1', 1), ('k2', 1), ('k1', 1), ('k2', 1)])
      .add_elements([('k1', 1), ('k2', 1)])
      .advance_watermark_to(1)
      .add_elements([('k1', 2), ('k2', 2)])
      .add_elements([('k1', 2), ('k2', 2)])
      .advance_watermark_to(2)
      .add_elements([('k1', 3), ('k2', 3)])
      .add_elements([('k1', 3), ('k2', 3)])
      .advance_watermark_to_infinity())
  # yapf: enable

  # Sliding, two-second windows every one second, with DefaultTrigger
  # (after watermark).
  windowing = Windowing(SlidingWindows(2, 1))
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k2', IntervalWindow(-1, 1), [1, 1, 1]),
            ('k1', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k2', IntervalWindow(0, 2), [1, 1, 1, 2, 2]),
            ('k1', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k2', IntervalWindow(1, 3), [2, 2, 3, 3]),
            ('k1', IntervalWindow(2, 4), [3, 3]),
            ('k2', IntervalWindow(2, 4), [3, 3]),
        ]))
def test_buffering_timer_in_global_window_streaming(self):
  max_buffering_duration_secs = 42

  start_time = timestamp.Timestamp(0)
  test_stream = TestStream().advance_watermark_to(start_time)
  for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
    test_stream.add_elements(
        [TimestampedValue(value, start_time + i)]) \
      .advance_processing_time(5)
  test_stream.advance_watermark_to(
      start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
    .advance_watermark_to_infinity()

  with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
    # Set a batch size larger than the total number of elements.
    # Since we're in a global window, we would have been waiting
    # for all the elements to arrive without the buffering time limit.
    batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

    # To trigger the processing time timer, use a fake clock with start time
    # being Timestamp(0). Since the fake clock never really advances during
    # the pipeline execution, meaning that the timer is always set to the
    # same value, the timer will be fired on every element after the first
    # firing.
    fake_clock = FakeClock(now=start_time)

    num_elements_per_batch = (
        pipeline
        | test_stream
        | WindowInto(
            GlobalWindows(),
            trigger=Repeatedly(AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | util.GroupIntoBatches(
            batch_size, max_buffering_duration_secs, fake_clock)
        | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
        | GroupByKey()
        | FlatMapTuple(lambda k, vs: vs))

    # We will flush twice: when the max buffering duration is reached and
    # when the global window ends.
    assert_that(num_elements_per_batch, equal_to([9, 1]))
def test_with_trigger_window_that_finish(self):
  def tsv(key, value, ts):
    return TimestampedValue((key, value), timestamp=ts)

  # yapf: disable
  test_stream = (
      TestStream()
      .advance_watermark_to(0)
      .add_elements([tsv('k1', 1, 0), tsv('k1', 2, 0)])
      .add_elements([tsv('k1', 3, 0)])
      .advance_watermark_to(2)
      .add_elements([tsv('k1', 6, 0)])  # This value is discarded.
      .advance_watermark_to_infinity())
  # yapf: enable

  # Fixed, one-second windows with an AfterWatermark trigger, no allowed
  # lateness, in discarding mode.
  windowing = Windowing(
      FixedWindows(1),
      triggerfn=AfterWatermark(),
      allowed_lateness=0,
      accumulation_mode=AccumulationMode.DISCARDING)
  with TestPipeline() as p:
    result = (
        p
        | test_stream
        | WindowInto(windowing.windowfn)
        | ParDo(trigger_manager._ReifyWindows())
        | ParDo(trigger_manager._GroupBundlesByKey())
        | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
        | Map(
            lambda elm:
            (elm[0], elm[1][0].windows[0], [v.value for v in elm[1]])))
    assert_that(
        result,
        equal_to([
            ('k1', IntervalWindow(0, 1), [1, 2, 3]),  # On the watermark
        ]))
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | WindowInto(
       GlobalWindows(),
       trigger=Repeatedly(
           AfterAny(AfterCount(BATCH_SIZE),
                    AfterProcessingTime(BUFFERING_SECS))),
       accumulation_mode=AccumulationMode.DISCARDING)
   | GroupByKey()
   | Map(
       lambda kv: logging.info(
           'key: %s, value count: %s', kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
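# make_large_elements is referenced above but not defined in this excerpt.
# A hypothetical stand-in that pads each input to roughly 128 KiB of text,
# matching the inline comment in the pipeline:
def make_large_elements(i):
  yield str(i).ljust(128 * 1024, '*')  # ~128 KiB per element

# Design note: Repeatedly(AfterAny(AfterCount(...), AfterProcessingTime(...)))
# bounds each emitted batch in both size and latency; whichever condition is
# met first fires the trigger, and Repeatedly re-arms it for the next batch.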
def expand(self, input):
  return (
      input
      | WindowInto(FixedWindows(self.duration))
      | ExtractAndSumScore("team"))