def test_gbk_execution_after_watermark_trigger(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['a'])
                 .advance_watermark_to(20))

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def fired_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (p  # pylint: disable=unused-variable
             | test_stream
             | beam.WindowInto(
                 FixedWindows(15),
                 trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                 accumulation_mode=trigger.AccumulationMode.DISCARDING)
             | beam.Map(lambda x: ('k', x))
             | beam.GroupByKey()
             | beam.Map(fired_elements))
  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.

  # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
  # assert_that(records, equal_to([
  #     ('k', ['a']), ('k', [])]))
  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('k', ['a']), ('k', [])], result)
def test_gbk_execution_after_watermark_trigger(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements(['a'])
                 .advance_watermark_to(20)
                 .advance_watermark_to_infinity())

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (p  # pylint: disable=unused-variable
             | test_stream
             | beam.WindowInto(
                 FixedWindows(15),
                 trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                 accumulation_mode=trigger.AccumulationMode.DISCARDING)
             | beam.Map(lambda x: ('k', x))
             | beam.GroupByKey())
  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(15, 30): [
          ('k', ['a']),
          ('k', []),
      ],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()
def test_gbk_execution_after_watermark_trigger(self):
  test_stream = (TestStream()
                 .advance_watermark_to(10)
                 .add_elements([TimestampedValue('a', 11)])
                 .advance_watermark_to(20)
                 .add_elements([TimestampedValue('b', 21)])
                 .advance_watermark_to_infinity())  # yapf: disable

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p  # pylint: disable=unused-variable
      | test_stream
      | beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey())

  # TODO(https://github.com/apache/beam/issues/18441): timestamp assignment
  # for elements from a GBK should respect the TimestampCombiner. The test
  # below should also verify the timestamps of the outputted elements once
  # this is implemented.

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(0, 15): [('k', ['a']), ('k', [])],
      window.IntervalWindow(15, 30): [('k', ['b']), ('k', [])],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def test_streaming_complex_timing(self):
  # Use state on the TestCase class, since other references would be pickled
  # into a closure and not have the desired side effects.
  #
  # TODO(BEAM-5295): Use assert_that after it works for the cases here in
  # streaming mode.
  WriteFilesTest.all_records = []

  dir = '%s%s' % (self._new_tempdir(), os.sep)

  # Setting up the input (TestStream)
  ts = TestStream().advance_watermark_to(0)
  for elm in WriteFilesTest.LARGER_COLLECTION:
    timestamp = int(elm)

    ts.add_elements([('key', '%s' % elm)])
    if timestamp % 5 == 0 and timestamp != 0:
      # TODO(BEAM-3759): Add many firings per window after getting PaneInfo.
      ts.advance_processing_time(5)
      ts.advance_watermark_to(timestamp)
  ts.advance_watermark_to_infinity()

  def no_colon_file_naming(*args):
    file_name = fileio.destination_prefix_naming()(*args)
    return file_name.replace(':', '_')

  # The pipeline that we are testing
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  with TestPipeline(options=options) as p:
    res = (p
           | ts
           | beam.WindowInto(
               FixedWindows(10),
               trigger=trigger.AfterWatermark(),
               accumulation_mode=trigger.AccumulationMode.DISCARDING)
           | beam.GroupByKey()
           | beam.FlatMap(lambda x: x[1]))
    # Triggering after 5 processing-time seconds, and on the watermark. Also
    # discarding old elements.

    _ = (res
         | beam.io.fileio.WriteToFiles(
             path=dir,
             file_naming=no_colon_file_naming,
             max_writers_per_bundle=0)
         | beam.Map(lambda fr: FileSystems.join(dir, fr.file_name))
         | beam.ParDo(self.record_dofn()))

  # Verification pipeline
  with TestPipeline() as p:
    files = (p | beam.io.fileio.MatchFiles(FileSystems.join(dir, '*')))

    file_names = (files | beam.Map(lambda fm: fm.path))
    file_contents = (
        files
        | beam.io.fileio.ReadMatches()
        | beam.Map(lambda rf: (rf.metadata.path,
                               rf.read_utf8().strip().split('\n'))))

    content = (file_contents
               | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

    assert_that(file_names,
                equal_to(WriteFilesTest.all_records),
                label='AssertFilesMatch')
    assert_that(content,
                matches_all(WriteFilesTest.LARGER_COLLECTION),
                label='AssertContentsMatch')
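# The test above calls self.record_dofn(), which is defined elsewhere on the
# test class. A minimal sketch of what such a helper could look like,
# assuming all it needs to do is append each file name to the class-level
# WriteFilesTest.all_records list (class state is used, per the comment in
# the test, so the records survive the DoFn being pickled); the DoFn body
# here is an assumption, not the actual implementation:
def record_dofn():
  class RecordDoFn(beam.DoFn):
    def process(self, element):
      # Accumulate every observed file name on the test class.
      WriteFilesTest.all_records.append(element)

  return RecordDoFn()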
def test_multiple_outputs_with_watermark_advancement(self):
  """Tests that the TestStream can independently control output watermarks."""

  # Purposely set the watermark of numbers to 20 then letters to 5 to test
  # that the watermark advancement is per PCollection.
  #
  # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
  # emitted at different times so that they will have different windows. The
  # watermark advancement is checked by checking their windows. If the
  # watermark does not advance, then the windows will be [-inf, -inf). If the
  # windows do not advance separately, then the PCollections will both be
  # windowed in [15, 30).
  letters_elements = [
      TimestampedValue('a', 6),
      TimestampedValue('b', 7),
      TimestampedValue('c', 8),
  ]
  numbers_elements = [
      TimestampedValue('1', 21),
      TimestampedValue('2', 22),
      TimestampedValue('3', 23),
  ]
  test_stream = (TestStream()
                 .advance_watermark_to(0, tag='letters')
                 .advance_watermark_to(0, tag='numbers')
                 .advance_watermark_to(20, tag='numbers')
                 .advance_watermark_to(5, tag='letters')
                 .add_elements(letters_elements, tag='letters')
                 .advance_watermark_to(10, tag='letters')
                 .add_elements(numbers_elements, tag='numbers')
                 .advance_watermark_to(30, tag='numbers'))

  options = StandardOptions(streaming=True)
  p = TestPipeline(is_integration_test=True, options=options)

  main = p | test_stream

  # Use an AfterWatermark trigger with an early firing to test that the
  # watermark is advancing properly and that the element is being emitted in
  # the correct window.
  letters = (
      main['letters']
      | 'letter windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'letter with key' >> beam.Map(lambda x: ('k', x))
      | 'letter gbk' >> beam.GroupByKey())

  numbers = (
      main['numbers']
      | 'number windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'number with key' >> beam.Map(lambda x: ('k', x))
      | 'number gbk' >> beam.GroupByKey())

  # The letters were emitted when the watermark was at 5, thus we expect to
  # see the elements in the [0, 15) window. We used an early trigger to make
  # sure that the ON_TIME empty pane was also emitted with a TestStream.
  # This pane has no data because the early trigger causes the elements to
  # fire before the end of the window and because the accumulation mode
  # discards any data after the trigger fired.
  expected_letters = {
      window.IntervalWindow(0, 15): [
          ('k', ['a', 'b', 'c']),
          ('k', []),
      ],
  }

  # Same here, except the numbers were emitted at watermark = 20, thus they
  # are in the [15, 30) window.
  expected_numbers = {
      window.IntervalWindow(15, 30): [
          ('k', ['1', '2', '3']),
          ('k', []),
      ],
  }
  assert_that(
      letters,
      equal_to_per_window(expected_letters),
      label='letters assert per window')
  assert_that(
      numbers,
      equal_to_per_window(expected_numbers),
      label='numbers assert per window')

  p.run()
def run(argv=None):
  # Add command line arguments
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output',
      required=True,
      help='Output BigQuery table for results specified as: '
      'PROJECT:DATASET.TABLE or DATASET.TABLE.')
  parser.add_argument(
      '--input_subscription',
      required=True,
      help='Input PubSub subscription of the form '
      '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".')
  parser.add_argument(
      '--output_subscription',
      required=True,
      help='Output PubSub subscription of the form '
      '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Set pipeline options
  pipeline_options = PipelineOptions(pipeline_args)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  p = beam.Pipeline(options=pipeline_options)

  # Main pipeline: read in Logs, write them to BigQuery
  message_table = 'logs'
  messages = (
      p
      | 'Read from PubSub' >> beam.io.ReadFromPubSub(
          subscription=known_args.input_subscription).with_output_types(bytes)
      | 'Decode messages' >> beam.Map(lambda x: x.decode('utf-8'))
      | 'Parse messages to Logs' >> beam.ParDo(MessageToLog())
      | 'Detect language' >> beam.ParDo(TranslateMessage()))

  (messages
   | 'Convert Log to BigQuery records' >> beam.Map(
       json_to_bqrecords.json_to_bqrecord)
   | 'Write Logs to BigQuery' >> beam.io.WriteToBigQuery(
       known_args.output + message_table,
       schema=json_schema.log_table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  # Calculate aggregates per language, write to BigQuery and Pub/Sub
  language_aggregate_table = 'languages'
  languages = (
      messages
      | 'Extract language tuple' >> beam.Map(
          lambda x: (x.translate_language, x))
      | 'Assign Fixed Windows' >> beam.WindowInto(
          window.FixedWindows(60, 0),
          trigger=trigger.AfterWatermark(),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | 'GroupByKey Languages' >> beam.GroupByKey()
      | 'Count languages' >> beam.ParDo(LanguageAggregate()))

  (languages
   | 'Convert language aggregate to BigQuery records' >> beam.Map(
       json_to_bqrecords.language_aggregate_to_bqrecords)
   | 'Write LanguageAggregate to BigQuery' >> beam.io.WriteToBigQuery(
       known_args.output + language_aggregate_table,
       schema=json_schema.language_table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  (languages
   | 'Convert language aggregate to PubSub message' >> beam.Map(
       json_to_bqrecords.language_aggregate_to_pubsubmessage)
   | 'Encode' >> beam.Map(
       lambda x: json.dumps(x, ensure_ascii=False).encode('utf-8')
   ).with_output_types(bytes)
   | 'Write LanguageAggregate to PubSub' >> beam.io.WriteToPubSub(
       known_args.output_subscription))

  # Calculate aggregates per user, write to BigQuery
  user_aggregate_table = 'users'
  (messages
   | 'Extract user tuple' >> beam.Map(lambda x: (x.user_id, x))
   | 'Assign Sessions' >> beam.WindowInto(
       window.Sessions(30),
       trigger=trigger.AfterWatermark(),
       accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
   | 'GroupByKey Users' >> beam.GroupByKey()
   | 'Count user' >> beam.ParDo(UserAggregate())
   | 'Convert user aggregate to BigQuery records' >> beam.Map(
       json_to_bqrecords.user_aggregate_to_bqrecords)
   | 'Write UserAggregate to BigQuery' >> beam.io.WriteToBigQuery(
       known_args.output + user_aggregate_table,
       schema=json_schema.user_table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  result = p.run()
  result.wait_until_finish()
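# LanguageAggregate, UserAggregate, MessageToLog, and TranslateMessage are
# defined elsewhere in the project. A minimal sketch of what a per-language
# aggregator could look like, assuming each GroupByKey output is a
# (language, iterable-of-logs) pair and the downstream converters only need
# the language, a count, and the window start; the output field names here
# are assumptions for illustration:
class LanguageAggregate(beam.DoFn):
  def process(self, element, win=beam.DoFn.WindowParam):
    language, logs = element
    yield {
        'language': language,
        'count': len(list(logs)),
        'window_start': win.start.to_utc_datetime().isoformat(),
    }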
def main(argv=None):
  def json_parser(x):
    parsed = json.loads(x)
    return parsed

  def bye(x):
    logging.info('outing: %s', x)
    return x

  parser = argparse.ArgumentParser()
  parser.add_argument("--input_topic")
  parser.add_argument("--output_topic")
  known_args, _ = parser.parse_known_args(argv)

  p = beam.Pipeline(options=PipelineOptions())

  data = (p
          | 'ReadData' >> beam.io.ReadFromPubSub(
              topic=READ_TOPIC).with_output_types(bytes)
          | "JSONParse" >> beam.Map(json_parser))

  (data
   | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
   | "Windowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
       accumulation_mode=tr.AccumulationMode.DISCARDING,
       allowed_lateness=0)
   | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye' >> beam.Map(bye)
   | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "SlidWindowing" >> beam.WindowInto(
       window.FixedWindows(60),
       trigger=(tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
           late=tr.Repeatedly(tr.AfterCount(1)))),
       allowed_lateness=300,
       accumulation_mode=tr.AccumulationMode.ACCUMULATING)
   | "Extract" >> beam.Map(lambda x: x["meter_increment"])
   | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
   | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
   | "Enrich with time data" >> beam.ParDo(Enrich())
   | "ToBytesCount" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye2' >> beam.Map(bye)
   | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

  (data
   | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
   | "SessionWindowing" >> beam.WindowInto(
       window.Sessions(60),
       trigger=tr.AfterWatermark(
           early=tr.Repeatedly(
               tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
       accumulation_mode=tr.AccumulationMode.ACCUMULATING,
       allowed_lateness=0)
   | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
   | "Discarding Key" >> beam.Map(lambda x: x[1])
   # Keep only pickup events. (A Map returning None here would emit None
   # elements downstream instead of dropping them.)
   | "Filter not pickup" >> beam.Filter(
       lambda x: str(x["ride_status"]) == "pickup")
   | "ToBytesPickup" >> beam.Map(
       lambda x: json.dumps(x, indent=2).encode('utf-8'))
   | 'Bye3' >> beam.Map(bye)
   | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

  result = p.run()
  result.wait_until_finish()
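# PickupFn and Enrich are defined elsewhere. Because PickupFn is passed to
# beam.CombinePerKey, it must be a CombineFn. A minimal sketch under the
# assumption that it keeps the latest "pickup" event seen for each ride_id
# (the field semantics are guesses based on the step labels above):
class PickupFn(beam.CombineFn):
  def create_accumulator(self):
    return {}

  def add_input(self, accumulator, input):
    # Keep the pickup event; otherwise retain whatever we have so far.
    return input if str(input.get("ride_status")) == "pickup" else accumulator

  def merge_accumulators(self, accumulators):
    merged = {}
    for acc in accumulators:
      if str(acc.get("ride_status")) == "pickup":
        merged = acc
    return merged

  def extract_output(self, accumulator):
    return accumulator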
def run(argv=None):
  # Use Python argparse module to parse custom arguments
  parser = argparse.ArgumentParser()
  parser.add_argument('--network')
  parser.add_argument('--input',
                      dest='input',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      help='Output file to write results to.')
  parser.add_argument('--output_topic',
                      dest='out_topic',
                      help=('Output PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument('--input_topic',
                      dest='in_topic',
                      help=('Input PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  '''google_cloud_options.job_name = 'dataflow-job-{}'.format(
      datetime.datetime.now().strftime("%Y-%m-%d%H%M%S")
  )'''
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  lines = (p
           | 'receive_data' >> beam.io.ReadFromPubSub(
               subscription=known_args.in_topic).with_output_types(bytes)
           | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
           | 'jsonload' >> beam.Map(lambda x: json.loads(x)))

  '''tab = []
  for i in range(len(lines)):
      test = {}
      test['time'] = lines[i]['timestamp']'''

  # ----- Fixed windows + AfterWatermark trigger + discarding mode ----- #
  (lines
   | 'timestamp' >> beam.Map(get_timestamp)
   | 'window' >> beam.WindowInto(
       window.FixedWindows(10),
       trigger=trigger.AfterWatermark(),
       accumulation_mode=trigger.AccumulationMode.DISCARDING)
   | 'CountGlobally' >> beam.CombineGlobally(
       beam.combiners.CountCombineFn()).without_defaults()
   | 'printnbrarticles' >> beam.ParDo(PrintFn())
   | 'jsondumps' >> beam.Map(lambda x: json.dumps(x))
   | 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
   | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic))

  p.run().wait_until_finish()
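# get_timestamp and PrintFn are referenced above but not shown. Minimal
# sketches of what they might look like, assuming each parsed JSON message
# carries a 'timestamp' field (epoch seconds) to use as the event time; both
# the field name and the log format are assumptions:
def get_timestamp(element):
  # Re-emit the element with an explicit event timestamp.
  return window.TimestampedValue(element, element['timestamp'])


class PrintFn(beam.DoFn):
  def process(self, element, win=beam.DoFn.WindowParam):
    # Log the per-window count before it is serialized and published.
    logging.info('%s elements in window %s', element, win)
    yield element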
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  # TODO(BEAM-9322): Remove use of this experiment.
  # This flag is only necessary when using the multi-output TestStream b/c
  # it relies on using the PCollection output tags as the PCollection output
  # ids.
  options = StandardOptions(streaming=True)
  options.view_as(DebugOptions).add_experiment(
      'passthrough_pcollection_output_ids')

  p = TestPipeline(options=options)

  test_stream = (
      p
      | 'Mixed TestStream' >> TestStream()
      .advance_watermark_to(3, tag='main')
      .add_elements(['a1'], tag='main')
      .advance_watermark_to(8, tag='main')
      .add_elements(['a2'], tag='main')
      .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
      .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
      .advance_watermark_to_infinity(tag='main')
      .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(self,
                elm=beam.DoFn.ElementParam,
                ts=beam.DoFn.TimestampParam,
                side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [
          ('a2', Timestamp(8), [400, 0]),
      ],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()