def _window_fn(self):
    """Return the ``WindowInto`` transform applied ahead of the file writes.

    Batch pipelines, and streaming pipelines that use auto-sharding, get a
    plain global window with the default trigger.  Streaming pipelines
    without auto-sharding get a global window that repeatedly fires on
    whichever comes first: ``self.triggering_frequency`` of processing time,
    or ``_FILE_TRIGGERING_RECORD_COUNT`` buffered records.

    Returns:
        A ``beam.WindowInto`` PTransform.
    """
    # With dynamic (auto) sharding the sharding transform already batches
    # elements so that it does not produce many tiny files; the default
    # trigger is enough here and the user trigger is applied right after the
    # writes to limit the number of load jobs.  Batch pipelines need no
    # trigger at all.
    if not self.is_streaming_pipeline or self.with_auto_sharding:
        return beam.WindowInto(beam.window.GlobalWindows())

    # The user-supplied triggering_frequency is usually picked to cap the
    # number of BigQuery load jobs (daily quota).  If it is large, relying on
    # it alone would force all data to be buffered until it fires, so files
    # are also flushed once a threshold number of records is ready.  Only the
    # actual BigQuery load honours the user trigger exclusively, which lets
    # the data be offloaded to the filesystem in the meantime.
    flush_on_time = trigger.AfterProcessingTime(self.triggering_frequency)
    flush_on_count = trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT)
    return beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(
            trigger.AfterAny(flush_on_time, flush_on_count)),
        accumulation_mode=trigger.AccumulationMode.DISCARDING)
def load(events, metadata=None, pipeline_options=None):
    """Apply the query10 transforms to ``events`` and return the result.

    Events are sharded, windowed, grouped and written by ``WriteEventDoFn``;
    the outputs of that step are windowed and grouped again and handed to
    ``WriteIndexDoFn``.

    Args:
        events: Input PCollection.
        metadata: Mapping read with ``.get`` for the keys ``'window_size_sec'``
            and ``'max_log_events'``.  NOTE(review): the default of ``None``
            is dereferenced unconditionally, so callers must pass a mapping.
        pipeline_options: Forwarded as an extra argument to both write DoFns.

    Returns:
        The PCollection produced by the final ``query10_write_index`` step.
    """
    # One day, in seconds.  A long allowed lateness means that any forgotten
    # watermark hold stalls the pipeline for that period and is therefore
    # very noticeable.
    one_day_sec = 1 * 24 * 60 * 60

    sharded = events | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())

    # The composite trigger fires each sub-trigger in order:
    #   1. repeatedly after at least max_log_events elements are in the pane,
    #      finishing when the watermark passes the end of the window;
    #   2. then repeatedly after at least max_log_events elements are in the
    #      pane, or once processing time passes the first element in the pane
    #      plus LATE_BATCHING_PERIOD.
    until_watermark = trigger.OrFinally(
        trigger.Repeatedly(
            trigger.AfterCount(metadata.get('max_log_events'))),
        trigger.AfterWatermark())
    after_watermark = trigger.Repeatedly(
        trigger.AfterAny(
            trigger.AfterCount(metadata.get('max_log_events')),
            trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))
    windowed_events = sharded | 'query10_fix_window' >> beam.WindowInto(
        window.FixedWindows(metadata.get('window_size_sec')),
        trigger=trigger.AfterEach(until_watermark, after_watermark),
        accumulation_mode=trigger.AccumulationMode.DISCARDING,
        allowed_lateness=Duration.of(one_day_sec))

    written_events = (
        windowed_events
        | 'query10_gbk' >> beam.GroupByKey()
        | 'query10_write_event' >> beam.ParDo(
            WriteEventDoFn(), pipeline_options))

    windowed_files = written_events | 'query10_window_log_files' >> (
        beam.WindowInto(
            window.FixedWindows(metadata.get('window_size_sec')),
            accumulation_mode=trigger.AccumulationMode.DISCARDING,
            allowed_lateness=Duration.of(one_day_sec)))

    return (
        windowed_files
        | 'query10_gbk_2' >> beam.GroupByKey()
        | 'query10_write_index' >> beam.ParDo(
            WriteIndexDoFn(), pipeline_options))
def test_on_pane_watermark_hold_no_pipeline_stall(self):
    """Regression test for https://issues.apache.org/jira/browse/BEAM-10054.

    Feeds two elements through a ``TestStream``, advancing processing time
    and the watermark after each one, then groups them by key inside
    one-minute fixed windows.  The test contains no explicit assertions; it
    is expected to pass simply by the pipeline finishing when the ``with``
    block exits.
    """
    base_ts = 1534842000

    # After the n-th element (1-based) both processing time and watermark
    # are advanced using base_ts + n.
    stream = TestStream()
    for step, element in enumerate(['a', 'b'], start=1):
        stream.add_elements([element])
        stream.advance_processing_time(base_ts + step)
        stream.advance_watermark_to(base_ts + step)

    # Fire on whichever happens first: an hour of processing time or the
    # watermark passing the end of the window.
    first_of_time_or_watermark = trigger.AfterAny(
        trigger.AfterProcessingTime(3600), trigger.AfterWatermark())
    one_minute_windows = beam.WindowInto(
        beam.window.FixedWindows(60),
        trigger=first_of_time_or_watermark,
        accumulation_mode=trigger.AccumulationMode.DISCARDING)

    with TestPipeline(options=PipelineOptions(['--streaming'])) as p:
        # pylint: disable=expression-not-assigned
        (
            p
            | 'TestStream' >> stream
            | 'timestamp' >> beam.Map(
                lambda x: beam.window.TimestampedValue(x, base_ts))
            | 'kv' >> beam.Map(lambda x: (x, x))
            | 'window_1m' >> one_minute_windows
            | 'group_by_key' >> beam.GroupByKey()
            | 'filter' >> beam.Map(lambda x: x))
def _window_fn(self):
    """Return the ``WindowInto`` transform applied ahead of the file writes.

    Every pipeline is placed in the global window.  A streaming pipeline
    additionally gets a repeating, discarding trigger that fires on
    whichever comes first: ``self.triggering_frequency`` of processing time,
    or ``_FILE_TRIGGERING_RECORD_COUNT`` buffered records.

    Returns:
        A ``beam.WindowInto`` PTransform.
    """
    # Keyword arguments added only for streaming; an empty dict leaves
    # WindowInto with its defaults, exactly as in the batch case.
    streaming_kwargs = {}
    if self.is_streaming_pipeline:
        # The user-supplied triggering_frequency is usually picked to cap the
        # number of BigQuery load jobs (daily quota).  If it is large,
        # relying on it alone would force all data to be buffered until it
        # fires, so files are also written once a threshold number of
        # records is ready.  Only the actual BigQuery load uses the user
        # trigger exclusively, which lets the data be offloaded to the
        # filesystem in the meantime.
        streaming_kwargs['trigger'] = trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterProcessingTime(self.triggering_frequency),
                trigger.AfterCount(_FILE_TRIGGERING_RECORD_COUNT)))
        streaming_kwargs['accumulation_mode'] = (
            trigger.AccumulationMode.DISCARDING)

    return beam.WindowInto(beam.window.GlobalWindows(), **streaming_kwargs)
def run(argv=None):
    """Build and run the streaming pipeline that aggregates Pub/Sub events.

    Messages are read from the ``--input`` topic, decoded as UTF-8 JSON and
    passed through ``FilterNoVenueEventsFn``.  Two aggregates are then
    published to the ``--output`` topic: a global event count and the top 10
    topics per fixed window.

    Args:
        argv: Command-line flags handed to ``PipelineOptions``.
    """

    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input',
                default='projects/notbanana-7f869/topics/rsvps_source')
            parser.add_argument(
                '--output',
                default='projects/notbanana-7f869/topics/rsvps_out')

    options = PipelineOptions(flags=argv)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    gcp_options = options.view_as(GoogleCloudOptions)
    gcp_options.project = 'notbanana-7f869'
    gcp_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
    gcp_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
    gcp_options.job_name = 'demo-job'

    # -> Run the pipeline on the Cloud Dataflow runner.
    # $ python pipelines/main.py --setup_file path/to/setup.py
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        topic_options = options.view_as(MyOptions)
        source_topic = topic_options.input
        sink_topic = topic_options.output

        # -> Consumes/collects events sent by the input Pub/Sub topic.
        # @: the id_label argument is a unique identifier the pipeline can
        #    use to deduplicate events (exactly-once semantics); it is
        #    currently left disabled.
        read_events = beam.io.ReadFromPubSub(
            topic=source_topic,
            # id_label='event_id'
        ).with_output_types(six.binary_type)

        events = (
            p
            | 'Read From Pub/Sub' >> read_events
            | 'Decode Binary' >> beam.Map(
                lambda element: element.decode('utf-8'))
            | 'Transform Json To Dict' >> beam.Map(
                lambda element: json.loads(element))
            | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))

        # -> Outputs the total number of events globally processed by the
        # pipeline.  Early results are triggered from the window every X
        # seconds (processing-time trigger) or as soon as the current pane
        # has collected at least N elements (data-driven trigger).
        # Values used are for testing purposes.
        early_results = trigger.Repeatedly(
            trigger.AfterAny(
                trigger.AfterCount(2),
                # Original author's note: AfterProcessingTime is
                # experimental / not implemented yet.
                trigger.AfterProcessingTime(30)))
        global_window = beam.WindowInto(
            beam.window.GlobalWindows(),
            trigger=early_results,
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        (
            events
            | 'Apply Global Window' >> global_window
            | 'Count events globally' >> beam.CombineGlobally(
                beam.combiners.CountCombineFn()).without_defaults()
            | 'Publish %s' % 'Events' >> WriteToPubSub(
                topic=sink_topic, category=Category.GLOBAL_EVENTS))

        # -> Outputs the top 10 hottest topics within a fixed window of X
        # seconds.  Values used are for testing purposes.
        # NB: a custom TopFn is used so that k/v pairs are deduplicated when
        # an accumulating strategy is in effect: SO - 56616576
        # @guillem-xercavins
        topics_window = beam.WindowInto(
            beam.window.FixedWindows(size=10 * 60),
            trigger=trigger.Repeatedly(trigger.AfterCount(5)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        top_topics = TopDistinctFn(n=10, compare=lambda a, b: a[1] < b[1])
        (
            events
            | 'Apply Window of time %s' % 'Topics' >> topics_window
            | beam.Map(lambda element: element['group'])
            | beam.ParDo(PairTopicWithOneFn())
            | beam.CombinePerKey(sum)
            | 'Top 10 Topics' >> beam.CombineGlobally(
                top_topics).without_defaults()
            | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
            | 'Publish %s' % 'Topics' >> WriteToPubSub(
                topic=sink_topic, category=Category.HOT_TOPICS))