def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The trigger fires as each sub-trigger (executed in order) fires:
      #   1. repeatedly, after at least max_log_events elements in the pane,
      #      or finally when the watermark passes the end of the window;
      #   2. then repeatedly, after at least max_log_events elements in the
      #      pane, or when processing time passes the first element in the
      #      pane plus LATE_BATCHING_PERIOD.
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
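The composite trigger above is dense, so here is a hedged, standalone decomposition of the same expression with the two stages pulled out into named variables. The function name and the parameters max_log_events and late_batching_period are illustrative stand-ins for the values read from metadata and LATE_BATCHING_PERIOD in the original; this is a sketch, not part of the source.

from apache_beam.transforms import trigger


def sharded_output_trigger(max_log_events, late_batching_period):
  # Stage 1: fire repeatedly every `max_log_events` elements, until the
  # watermark passes the end of the window (OrFinally terminates the stage).
  before_watermark_passes = trigger.OrFinally(
      trigger.Repeatedly(trigger.AfterCount(max_log_events)),
      trigger.AfterWatermark())
  # Stage 2: afterwards, fire repeatedly on whichever comes first: another
  # `max_log_events` elements, or `late_batching_period` of processing time
  # since the first element in the pane.
  after_watermark_passes = trigger.Repeatedly(
      trigger.AfterAny(
          trigger.AfterCount(max_log_events),
          trigger.AfterProcessingTime(late_batching_period)))
  # AfterEach runs the two stages in order.
  return trigger.AfterEach(before_watermark_passes, after_watermark_passes)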
def expand(self, pcoll):
  return (
      pcoll
      | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
          beam.window.FixedWindows(self.team_window_duration),
          trigger=trigger.AfterWatermark(
              trigger.AfterCount(10), trigger.AfterCount(20)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
def expand(self, pcoll):
  return (
      pcoll
      # We will get early (speculative) results as well as cumulative
      # processing of late data.
      | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
          beam.window.FixedWindows(self.team_window_duration),
          trigger=trigger.AfterWatermark(
              trigger.AfterCount(10), trigger.AfterCount(20)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=self.allowed_lateness_seconds)
      # Extract and sum teamname/score pairs from the event data.
      | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
def expand(self, pcoll): logging.info("Calculate group values: {}".format(pcoll)) return ( pcoll # We will get early (speculative) results as well as cumulative # processing of late data. | 'HighValueGroupFixedWindows' >> beam.WindowInto( beam.window.FixedWindows(self.group_window_duration), trigger=trigger.AfterWatermark(trigger.AfterCount(10), trigger.AfterCount(20)), accumulation_mode=trigger.AccumulationMode.ACCUMULATING) # Extract and sum group/value pairs from the event data. | 'ExtractAndSumValue' >> ExtractAndSumValue('group'))
def expand(self, pcoll):
  # NOTE: the behavior does not exactly match the Java example.
  # TODO: allowed_lateness is not implemented yet in FixedWindows.
  # TODO: AfterProcessingTime is not implemented yet; AfterCount is used as a
  # replacement.
  return (
      pcoll
      # We will get early (speculative) results as well as cumulative
      # processing of late data.
      | 'LeaderboardTeamFixedWindows' >> beam.WindowInto(
          beam.window.FixedWindows(self.team_window_duration),
          trigger=trigger.AfterWatermark(
              trigger.AfterCount(10), trigger.AfterCount(20)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      # Extract and sum teamname/score pairs from the event data.
      | 'ExtractAndSumScore' >> ExtractAndSumScore('team'))
def run_combine(pipeline, input_elements=5, lift_combiners=True):
  # Calculate the expected result, which is the sum of an arithmetic sequence.
  # By default, this is equal to: 0 + 1 + 2 + 3 + 4 = 10
  expected_result = input_elements * (input_elements - 1) / 2

  # Enable runtime type checking in order to cover TypeCheckCombineFn by
  # the test.
  pipeline.get_pipeline_options().view_as(
      TypeOptions).runtime_type_check = True
  pipeline.get_pipeline_options().view_as(
      TypeOptions).allow_unsafe_triggers = True

  with pipeline as p:
    pcoll = p | 'Start' >> beam.Create(range(input_elements))

    # Certain triggers, such as AfterCount, are incompatible with combiner
    # lifting. We can use that fact to prevent combiners from being lifted.
    if not lift_combiners:
      pcoll |= beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.AfterCount(input_elements),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)

    # Pass an additional 'None' in order to cover _CurriedFn by the test.
    pcoll |= 'Do' >> beam.CombineGlobally(
        combiners.SingleInputTupleCombineFn(
            CallSequenceEnforcingCombineFn(),
            CallSequenceEnforcingCombineFn()),
        None).with_fanout(fanout=1)

    assert_that(pcoll, equal_to([(expected_result, expected_result)]))
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  # TODO(BEAM-9322): Remove use of this experiment.
  # This flag is only necessary when using the multi-output TestStream b/c
  # it relies on using the PCollection output tags as the PCollection output
  # ids.
  p = TestPipeline(additional_pipeline_args=[
      '--experiments=' + 'passthrough_pcollection_output_ids'
  ])

  test_stream = (
      p
      | 'Mixed TestStream' >> TestStream()
      .advance_watermark_to(3, tag='main')
      .add_elements(['a1'], tag='main')
      .advance_watermark_to(8, tag='main')
      .add_elements(['a2'], tag='main')
      .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
      .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
      .advance_watermark_to_infinity(tag='main')
      .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(
        self,
        elm=beam.DoFn.ElementParam,
        ts=beam.DoFn.TimestampParam,
        side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()
def load(events, metadata=None, pipeline_options=None):
  num_events_in_pane = 30
  windowed_events = (
      events
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(num_events_in_pane)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))
  auction_by_seller_id = (
      windowed_events
      | nexmark_query_util.JustAuctions()
      | 'query3_filter_category' >> beam.Filter(lambda auc: auc.category == 10)
      | 'query3_key_by_seller' >> beam.ParDo(
          nexmark_query_util.AuctionBySellerFn()))
  person_by_id = (
      windowed_events
      | nexmark_query_util.JustPerson()
      | 'query3_filter_region' >> beam.Filter(
          lambda person: person.state in ['OR', 'ID', 'CA'])
      | 'query3_key_by_person_id' >> beam.ParDo(
          nexmark_query_util.PersonByIdFn()))
  return ({
      nexmark_query_util.AUCTION_TAG: auction_by_seller_id,
      nexmark_query_util.PERSON_TAG: person_by_id,
  }
          | beam.CoGroupByKey()
          | 'query3_join' >> beam.ParDo(
              JoinFn(metadata.get('max_auction_waiting_time')))
          | 'query3_output' >> beam.Map(
              lambda t: {
                  ResultNames.NAME: t[1].name,
                  ResultNames.CITY: t[1].city,
                  ResultNames.STATE: t[1].state,
                  ResultNames.AUCTION_ID: t[0].id
              }))
def _window_fn(self):
  """Set the correct WindowInto PTransform."""

  # The user-supplied triggering_frequency is often chosen to control how
  # many BigQuery load jobs are triggered, to prevent going over BigQuery's
  # daily quota for load jobs. If this is set to a large value, currently we
  # have to buffer all the data until the trigger fires. Instead we ensure
  # that the files are written if a threshold number of records are ready.
  # We use only the user-supplied trigger on the actual BigQuery load.
  # This allows us to offload the data to the filesystem.
  #
  # In the case of dynamic sharding, however, we use a default trigger since
  # the transform that performs sharding also batches elements to avoid
  # generating too many tiny files. The user trigger is applied right after
  # the writes, to limit the number of load jobs.
  if self.is_streaming_pipeline and not self.with_auto_sharding:
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
  else:
    return beam.WindowInto(beam.window.GlobalWindows())
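A minimal, self-contained sketch (not part of the original source) of the "flush by record count or by time" pattern the comment above describes: buffered key/value records are released whenever either a record-count threshold or a processing-time delay is reached, whichever happens first. The function name and the 500-record/60-second values are illustrative, not Beam defaults; depending on SDK version, a non-default trigger like this may also require the allow_unsafe_triggers pipeline option seen in the run_combine example above.

import apache_beam as beam
from apache_beam.transforms import trigger, window


def flush_by_count_or_time(records, max_buffered_records=500, flush_secs=60):
  # `records` is assumed to be an unbounded PCollection of (key, value) pairs.
  return (
      records
      | 'FlushTrigger' >> beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterAny(
                  trigger.AfterCount(max_buffered_records),
                  trigger.AfterProcessingTime(flush_secs))),
          # Each firing emits only the records buffered since the last firing.
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'FlushGroup' >> beam.GroupByKey())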
def expand(self, pcoll):
  return (
      pcoll
      | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(10)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
def test_gbk_execution_after_watermark_trigger(self):
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements([TimestampedValue('a', 11)])
      .advance_watermark_to(20)
      .add_elements([TimestampedValue('b', 21)])
      .advance_watermark_to_infinity())  # yapf: disable

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p  # pylint: disable=unused-variable
      | test_stream
      | beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey())

  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.

  # assert per window
  expected_window_to_elements = {
      window.IntervalWindow(0, 15): [('k', ['a']), ('k', [])],
      window.IntervalWindow(15, 30): [('k', ['b']), ('k', [])],
  }
  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      label='assert per window')

  p.run()
def expand(self, pcoll):
  return (
      pcoll
      | 'TweetGlobalWindows' >> beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(50)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=self.allowed_lateness_seconds)
      # Extract tweets per user_id from the event data.
      | 'ExtractTweets' >> ExtractTweets('user_id'))
def expand(self, p):
  # NOTE: allowed_lateness is not yet available in Python FixedWindows.
  # NOTE: AfterProcessingTime is not yet available in Python.
  return (
      p
      | 'window' >> beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(100)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | 'extract_user_score' >> ExtractAndSumScore('user'))
def expand(self, pcoll):
  return (
      pcoll
      # Get periodic results every ten events.
      | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(10)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=self.allowed_lateness_seconds)
      # Extract and sum username/score pairs from the event data.
      | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
def test_multi_triggered_gbk_side_input(self):
  """Test a GBK sideinput, with multiple triggering."""
  options = StandardOptions(streaming=True)
  p = TestPipeline(options=options)

  test_stream = (
      p
      | 'Mixed TestStream' >> TestStream()
      .advance_watermark_to(3, tag='main')
      .add_elements(['a1'], tag='main')
      .advance_watermark_to(8, tag='main')
      .add_elements(['a2'], tag='main')
      .add_elements([window.TimestampedValue(('k', 100), 2)], tag='side')
      .add_elements([window.TimestampedValue(('k', 400), 7)], tag='side')
      .advance_watermark_to_infinity(tag='main')
      .advance_watermark_to_infinity(tag='side'))

  main_data = (
      test_stream['main']
      | 'Main windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))

  side_data = (
      test_stream['side']
      | 'Side windowInto' >> beam.WindowInto(
          window.FixedWindows(5),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.CombinePerKey(sum)
      | 'Values' >> Map(lambda k_vs: k_vs[1]))

  class RecordFn(beam.DoFn):
    def process(
        self,
        elm=beam.DoFn.ElementParam,
        ts=beam.DoFn.TimestampParam,
        side=beam.DoFn.SideInputParam):
      yield (elm, ts, side)

  records = (
      main_data
      | beam.ParDo(RecordFn(), beam.pvalue.AsList(side_data)))

  expected_window_to_elements = {
      window.IntervalWindow(0, 5): [
          ('a1', Timestamp(3), [100, 0]),
      ],
      window.IntervalWindow(5, 10): [('a2', Timestamp(8), [400, 0])],
  }

  assert_that(
      records,
      equal_to_per_window(expected_window_to_elements),
      use_global_window=False,
      label='assert per window')

  p.run()
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              schema=self.schema,
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(
              schema=self.schema, file_format=self._temp_file_format),
          file_prefix_pcv,
          *self.schema_side_inputs))

  # TODO(BEAM-9494): Remove the identity transform. We flatten both
  # PCollection paths and use an identity function to work around a
  # flatten optimization issue where the wrong coder is being used.
  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten()
      | "IdentityWorkaround" >> beam.Map(lambda x: x))

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
def expand(self, pcoll): logging.info("Calculate user values: {}".format(pcoll)) return (pcoll # Get periodic results every ten events. | 'HighValueUserGlobalWindows' >> beam.WindowInto( beam.window.GlobalWindows(), trigger=trigger.Repeatedly(trigger.AfterCount(10)), accumulation_mode=trigger.AccumulationMode.ACCUMULATING) # Extract and sum username/value pairs from the event data. | 'ExtractAndSumValue' >> ExtractAndSumValue('user'))
def expand(self, pcoll):
  # NOTE: the behavior does not exactly match the Java example.
  # TODO: allowed_lateness is not implemented yet in FixedWindows.
  # TODO: AfterProcessingTime is not implemented yet; AfterCount is used as a
  # replacement.
  return (
      pcoll
      # Get periodic results every ten events.
      | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
          beam.window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(10)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      # Extract and sum username/score pairs from the event data.
      | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
def _maybe_apply_user_trigger(self, destination_file_kv_pc):
  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    return (
        destination_file_kv_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  else:
    return destination_file_kv_pc
def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
  outputs = (
      destination_data_kv_pc
      | beam.ParDo(
          WriteRecordsToFile(
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  if self.is_streaming_pipeline:
    # Apply the user's trigger back before we start triggering load jobs.
    all_destination_file_pairs_pc = (
        all_destination_file_pairs_pc
        | "ApplyUserTrigger" >> beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterAll(
                    trigger.AfterProcessingTime(self.triggering_frequency),
                    trigger.AfterCount(1))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING))
  return all_destination_file_pairs_pc
def load(events, metadata=None):
  # Find the winning bid for each closed auction.
  return (
      events
      # Find winning bids.
      | beam.Filter(nexmark_query_util.auction_or_bid)
      | winning_bids.WinningBids()
      # auction_bid -> (auction.seller, bid)
      | beam.Map(lambda auc_bid: (auc_bid.auction.seller, auc_bid.bid))
      # Calculate and output the moving mean as data arrives.
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
          allowed_lateness=0)
      | beam.CombinePerKey(MovingMeanSellingPriceFn(10))
      | beam.Map(
          lambda t: {
              ResultNames.SELLER: t[0], ResultNames.PRICE: t[1]
          }))
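A minimal sketch (not from the original source) of why ACCUMULATING mode matters above: with Repeatedly(AfterCount(1)) every new element fires a pane, and in accumulating mode each pane re-combines everything seen so far, so CombinePerKey emits a running value per key on each firing. The sketch uses Beam's built-in MeanCombineFn rather than the custom MovingMeanSellingPriceFn; the function name and inputs are illustrative.

import apache_beam as beam
from apache_beam.transforms import trigger, window


def running_mean_per_key(prices):
  # `prices` is assumed to be an unbounded PCollection of (key, price) pairs.
  return (
      prices
      | beam.WindowInto(
          window.GlobalWindows(),
          # Fire a new pane for every element...
          trigger=trigger.Repeatedly(trigger.AfterCount(1)),
          # ...and re-combine all elements seen so far on each firing.
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | beam.CombinePerKey(beam.combiners.MeanCombineFn()))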
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      # Filter to get only bids, then extract the bidder id.
      | nexmark_query_util.JustBids()
      | 'query11_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # Window the bids into per-bidder sessions, firing early results after
      # max_log_events elements.
      | 'query11_session_window' >> beam.WindowInto(
          window.Sessions(metadata.get('window_size_sec')),
          trigger=trigger.AfterWatermark(
              early=trigger.AfterCount(metadata.get('max_log_events'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=metadata.get('occasional_delay_sec') // 2)
      # Count bids per bidder.
      | beam.combiners.Count.PerElement()
      | beam.Map(
          lambda bidder_count: {
              ResultNames.BIDDER_ID: bidder_count[0],
              ResultNames.BID_COUNT: bidder_count[1]
          }))
def test(self):
  _ = (
      self.pipeline
      | 'Read from pubsub' >> ReadFromPubSub(
          subscription=self.read_sub_name,
          with_attributes=True,
          id_label='id',
      )
      | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
      | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
      | 'Window' >> beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterCount(self.num_of_messages)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'Count messages' >> beam.CombineGlobally(
          beam.combiners.CountCombineFn()).without_defaults()
      .with_output_types(int)
      | 'Convert to bytes' >> beam.Map(
          lambda count: str(count).encode('utf-8'))
      | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
def _window_fn(self):
  """Set the correct WindowInto PTransform."""

  # The user-supplied triggering_frequency is often chosen to control how
  # many BigQuery load jobs are triggered, to prevent going over BigQuery's
  # daily quota for load jobs. If this is set to a large value, currently we
  # have to buffer all the data until the trigger fires. Instead we ensure
  # that the files are written if a threshold number of records are ready.
  # We use only the user-supplied trigger on the actual BigQuery load.
  # This allows us to offload the data to the filesystem.
  if self.is_streaming_pipeline:
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT))),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
  else:
    return beam.WindowInto(beam.window.GlobalWindows())
def run():
  options = PipelineOptions([
      "--runner=PortableRunner",
      "--job_endpoint=localhost:8099",
      "--environment_type=LOOPBACK"
  ])
  # options = PipelineOptions([
  #     "--runner=FlinkRunner",
  #     "--flink_master=localhost:8081",
  # ])
  with beam.Pipeline(options=options) as p:
    (p
     | 'ReadFromKafka' >> ReadFromKafka(
         consumer_config={"bootstrap.servers": "localhost:9092"},
         topics=["beam-input"])
     | 'ExtractWords' >> beam.FlatMap(
         lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(
         lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))
def test_gbk_execution_after_watermark_trigger(self):
  test_stream = (
      TestStream()
      .advance_watermark_to(10)
      .add_elements(['a'])
      .advance_watermark_to(20))

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  global result  # pylint: disable=global-variable-undefined
  result = []

  def fired_elements(elem):
    result.append(elem)
    return elem

  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  p = TestPipeline(options=options)
  records = (
      p  # pylint: disable=unused-variable
      | test_stream
      | beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | beam.Map(lambda x: ('k', x))
      | beam.GroupByKey()
      | beam.Map(fired_elements))

  # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
  # respect the TimestampCombiner. The test below should also verify the
  # timestamps of the outputted elements once this is implemented.

  # TODO(BEAM-3377): Reinstate after assert_that in streaming is fixed.
  # assert_that(records, equal_to([
  #     ('k', ['a']), ('k', [])]))
  p.run()

  # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
  self.assertEqual([('k', ['a']), ('k', [])], result)
def run(argv=None):
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  (p
   | 'ReadFromKafka' >> ReadFromKafka(
       consumer_config={"bootstrap.servers": "localhost:9092"},
       topics=["beam-input"])
   # Extract words from the value of each (key, value) record.
   | 'ExtractWords' >> beam.FlatMap(
       lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
   | 'Window' >> beam.WindowInto(
       window.GlobalWindows(),
       trigger=trigger.Repeatedly(trigger.AfterCount(1)),
       accumulation_mode=AccumulationMode.ACCUMULATING)
   | 'Count' >> beam.combiners.Count.PerElement()
   | 'Format' >> beam.Map(
       lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
   | 'Log' >> beam.ParDo(LoggingDoFn()))

  result = p.run()
  result.wait_until_finish()
def run(argv=None):
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input', default=TW_INPUT)
      parser.add_argument('--output', default=TW_OUTPUT)

  options = PipelineOptions(flags=argv)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = PROJECT_ID
  google_cloud_options.staging_location = STAGING_LOCATION
  google_cloud_options.temp_location = TEMP_LOCATION
  google_cloud_options.flexrs_goal = 'COST_OPTIMIZED'
  # google_cloud_options.job_name = 'hashtags-battle-job'

  """
  -> Uncomment this to run the pipeline on the Cloud Dataflow runner.
  $ python main.py --setup_file ./setup.py --machine_type=n1-standard-2
    --max_num_workers=2 --disk_size_gb=30
  """
  # options.view_as(StandardOptions).runner = 'DataflowRunner'

  with beam.Pipeline(options=options) as p:
    my_options = options.view_as(MyOptions)
    input_topic = my_options.input
    output_topic = my_options.output

    """
    -> Consumes/collects events sent by the input Pub/Sub topic.
    @: the id_label argument is a unique identifier used by the pipeline to
       deduplicate events: exactly-once semantics.
    """
    inputs = \
        (p
         | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
             topic=input_topic,
             # id_label='event_id'
         ).with_output_types(six.binary_type)
         | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
         | 'Transform Json To Dict' >> beam.Map(
             lambda element: json.loads(element)))
    # | 'Add Event Time' >> beam.ParDo(AddTimestampFn())

    """
    -> Extracts the hashtags array from each event.
    """
    hashtags = \
        (inputs
         | 'Get Hashtags' >> beam.Map(lambda element: element['hashtags'])
         | 'Explode Hashtags' >> beam.FlatMap(lambda element: element))

    """
    -> Outputs a batch of pre-aggregated hashtags.
       Triggers early results from the window every X seconds
       (processing-time trigger) or when the current pane has collected at
       least N elements (data-driven trigger).
       Values used are for testing purposes.
    """
    (hashtags
     | 'Apply Daily Window' >> beam.WindowInto(
         beam.window.FixedWindows(SECONDS_IN_1_DAY),
         trigger=trigger.Repeatedly(trigger.AfterCount(10)),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | 'Grouping Hashtags' >> PairWithOneCombine()
     | 'Format Hashtags' >> beam.ParDo(FormatHashtagFn())
     | 'Batch Hashtags' >> beam.BatchElements(
         min_batch_size=49, max_batch_size=50)
     | 'Publish Hashtags' >> WriteToPubSub(
         topic=output_topic, category=Category.DAILY_HASHTAGS))

    """
    -> Outputs the sum of processed events for a given fixed-time window.
    """
    (hashtags
     | 'Apply 5 Minutes' >> beam.WindowInto(
         beam.window.FixedWindows(size=5 * 60),
         trigger=trigger.Repeatedly(trigger.AfterCount(20)),
         accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'CG+CC' >> beam.CombineGlobally(
         beam.combiners.CountCombineFn()).without_defaults()
     | 'Publish Events Sum' >> WriteToPubSub(
         topic=output_topic, category=Category.GLOBAL_EVENTS))

    """
    -> Outputs the top 5 trending hashtags within a given fixed-time window.
    """
    (hashtags
     | 'Apply %s Min FW' % '30' >> beam.WindowInto(
         beam.window.FixedWindows(size=SECONDS_IN_HALF_HOUR),
         trigger=trigger.Repeatedly(trigger.AfterCount(2)),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | 'Grouping Trends' >> PairWithOneCombine()
     | '%s Trending Hashtags' % TRENDING_HASHTAGS_LIMIT >> beam.CombineGlobally(
         TopDistinctFn(
             n=TRENDING_HASHTAGS_LIMIT,
             compare=lambda a, b: a[1] < b[1])).without_defaults()
     | 'Format Trending Hashtags' >> beam.ParDo(FormatHashtagsFn())
     | 'Publish Trending Hashtags' >> WriteToPubSub(
         topic=output_topic, category=Category.TRENDING_HASHTAGS))
def test_multiple_outputs_with_watermark_advancement(self):
  """Tests that the TestStream can independently control output watermarks."""

  # Purposely set the watermark of numbers to 20 then letters to 5 to test
  # that the watermark advancement is per PCollection.
  #
  # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
  # emitted at different times so that they will have different windows. The
  # watermark advancement is checked by checking their windows. If the
  # watermark does not advance, then the windows will be [-inf, -inf). If the
  # windows do not advance separately, then the PCollections will both be
  # windowed in [15, 30).
  letters_elements = [
      TimestampedValue('a', 6),
      TimestampedValue('b', 7),
      TimestampedValue('c', 8),
  ]
  numbers_elements = [
      TimestampedValue('1', 21),
      TimestampedValue('2', 22),
      TimestampedValue('3', 23),
  ]
  test_stream = (
      TestStream()
      .advance_watermark_to(0, tag='letters')
      .advance_watermark_to(0, tag='numbers')
      .advance_watermark_to(20, tag='numbers')
      .advance_watermark_to(5, tag='letters')
      .add_elements(letters_elements, tag='letters')
      .advance_watermark_to(10, tag='letters')
      .add_elements(numbers_elements, tag='numbers')
      .advance_watermark_to(30, tag='numbers'))

  options = StandardOptions(streaming=True)
  p = TestPipeline(is_integration_test=True, options=options)

  main = p | test_stream

  # Use an AfterWatermark trigger with an early firing to test that the
  # watermark is advancing properly and that the element is being emitted in
  # the correct window.
  letters = (
      main['letters']
      | 'letter windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'letter with key' >> beam.Map(lambda x: ('k', x))
      | 'letter gbk' >> beam.GroupByKey())

  numbers = (
      main['numbers']
      | 'number windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'number with key' >> beam.Map(lambda x: ('k', x))
      | 'number gbk' >> beam.GroupByKey())

  # The letters were emitted when the watermark was at 5, thus we expect to
  # see the elements in the [0, 15) window. We used an early trigger to make
  # sure that the ON_TIME empty pane was also emitted with a TestStream.
  # This pane has no data because the early trigger causes the elements to
  # fire before the end of the window and because the accumulation mode
  # discards any data after the trigger fired.
  expected_letters = {
      window.IntervalWindow(0, 15): [
          ('k', ['a', 'b', 'c']),
          ('k', []),
      ],
  }

  # Same here, except the numbers were emitted at watermark = 20, thus they
  # are in the [15, 30) window.
  expected_numbers = {
      window.IntervalWindow(15, 30): [
          ('k', ['1', '2', '3']),
          ('k', []),
      ],
  }

  assert_that(
      letters,
      equal_to_per_window(expected_letters),
      label='letters assert per window')
  assert_that(
      numbers,
      equal_to_per_window(expected_numbers),
      label='numbers assert per window')

  p.run()
def run(argv=None):
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input', default='projects/notbanana-7f869/topics/rsvps_source')
      parser.add_argument(
          '--output', default='projects/notbanana-7f869/topics/rsvps_out')

  options = PipelineOptions(flags=argv)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'notbanana-7f869'
  google_cloud_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
  google_cloud_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
  google_cloud_options.job_name = 'demo-job'

  """
  -> Run the pipeline on the Cloud Dataflow runner.
  $ python pipelines/main.py --setup_file path/to/setup.py
  """
  # options.view_as(StandardOptions).runner = 'DataflowRunner'

  with beam.Pipeline(options=options) as p:
    my_options = options.view_as(MyOptions)
    input_topic = my_options.input
    output_topic = my_options.output

    """
    -> Consumes/collects events sent by the input Pub/Sub topic.
    @: the id_label argument is a unique identifier used by the pipeline to
       deduplicate events: exactly-once semantics.
    """
    inputs = \
        (p
         | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
             topic=input_topic,
             # id_label='event_id'
         ).with_output_types(six.binary_type)
         | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
         | 'Transform Json To Dict' >> beam.Map(
             lambda element: json.loads(element))
         | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))

    """
    -> Outputs the total number of events globally processed by the pipeline.
       Triggers early results from the window every X seconds
       (processing-time trigger) or when the current pane has collected at
       least N elements (data-driven trigger).
       Values used are for testing purposes.
    """
    (inputs
     | 'Apply Global Window' >> beam.WindowInto(
         beam.window.GlobalWindows(),
         trigger=trigger.Repeatedly(
             trigger.AfterAny(
                 trigger.AfterCount(2),
                 # AfterProcessingTime is experimental.
                 # Not implemented yet.
                 trigger.AfterProcessingTime(30))),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | 'Count events globally' >> beam.CombineGlobally(
         beam.combiners.CountCombineFn()).without_defaults()
     | 'Publish %s' % 'Events' >> WriteToPubSub(
         topic=output_topic, category=Category.GLOBAL_EVENTS))

    """
    -> Outputs the top 10 hottest topics within a fixed window of X seconds.
       Values used are for testing purposes.
       NB: Uses a custom TopFn that deduplicates k/v pairs when using an
       accumulating strategy: SO-56616576 @guillem-xercavins
    """
    (inputs
     | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
         beam.window.FixedWindows(size=10 * 60),
         trigger=trigger.Repeatedly(trigger.AfterCount(5)),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | beam.Map(lambda element: element['group'])
     | beam.ParDo(PairTopicWithOneFn())
     | beam.CombinePerKey(sum)
     | 'Top 10 Topics' >> beam.CombineGlobally(
         TopDistinctFn(
             n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
     | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
     | 'Publish %s' % 'Topics' >> WriteToPubSub(
         topic=output_topic, category=Category.HOT_TOPICS))