class GenerateRecords(beam.DoFn):

  EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.REAL_TIME)
  COUNT_STATE = CombiningValueStateSpec(
      'count_state', VarIntCoder(), CountCombineFn())

  def __init__(self, frequency, total_records):
    self.total_records = total_records
    self.frequency = frequency

  def process(self, element, emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
    # Processing time timers should be set on ABSOLUTE TIME.
    emit_timer.set(self.frequency)
    yield element[1]

  @on_timer(EMIT_TIMER)
  def emit_values(
      self,
      emit_timer=beam.DoFn.TimerParam(EMIT_TIMER),
      count_state=beam.DoFn.StateParam(COUNT_STATE)):
    count = count_state.read() or 0
    if self.total_records == count:
      return

    count_state.add(1)
    # Processing time timers should be set on ABSOLUTE TIME.
    emit_timer.set(count + 1 + self.frequency)
    yield 'value'
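A minimal wiring sketch for GenerateRecords, not taken from the snippet above: stateful and timer-using DoFns require a keyed PCollection, so the single seed element is keyed first. The pipeline shape and the frequency/total_records values are illustrative assumptions.

import apache_beam as beam

def build_generate_records_pipeline():
  # Sketch only: a real-time timer loop like this is typically exercised on a
  # streaming runner; values below are arbitrary.
  p = beam.Pipeline()
  _ = (
      p
      | 'Seed' >> beam.Create(['input'])
      | 'KeyByConstant' >> beam.Map(lambda x: ('key', x))
      | 'GenerateRecords' >> beam.ParDo(
          GenerateRecords(frequency=1, total_records=100)))
  return p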
def _pardo_group_into_batches(batch_size, input_coder):
  ELEMENT_STATE = BagStateSpec('values', input_coder)
  COUNT_STATE = CombiningValueStateSpec('count', input_coder, CountCombineFn())
  EXPIRY_TIMER = TimerSpec('expiry', TimeDomain.WATERMARK)

  class _GroupIntoBatchesDoFn(DoFn):
    def process(
        self,
        element,
        window=DoFn.WindowParam,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        expiry_timer=DoFn.TimerParam(EXPIRY_TIMER)):
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      expiry_timer.set(window.end)
      element_state.add(element)
      count_state.add(1)
      count = count_state.read()
      if count >= batch_size:
        batch = [element for element in element_state.read()]
        yield batch
        element_state.clear()
        count_state.clear()

    @on_timer(EXPIRY_TIMER)
    def expiry(
        self,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE)):
      batch = [element for element in element_state.read()]
      if batch:
        yield batch
        element_state.clear()
        count_state.clear()

  return _GroupIntoBatchesDoFn()
def _pardo_group_into_batches(
    input_coder, batch_size, max_buffering_duration_secs, clock=time.time):
  ELEMENT_STATE = BagStateSpec('values', input_coder)
  COUNT_STATE = CombiningValueStateSpec('count', input_coder, CountCombineFn())
  WINDOW_TIMER = TimerSpec('window_end', TimeDomain.WATERMARK)
  BUFFERING_TIMER = TimerSpec('buffering_end', TimeDomain.REAL_TIME)

  class _GroupIntoBatchesDoFn(DoFn):
    def process(
        self,
        element,
        window=DoFn.WindowParam,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        window_timer=DoFn.TimerParam(WINDOW_TIMER),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      window_timer.set(window.end)
      element_state.add(element)
      count_state.add(1)
      count = count_state.read()
      if count == 1 and max_buffering_duration_secs > 0:
        # This is the first element in the batch. Start counting buffering time
        # if a limit was set.
        # pylint: disable=deprecated-method
        buffering_timer.set(clock() + max_buffering_duration_secs)
      if count >= batch_size:
        return self.flush_batch(element_state, count_state, buffering_timer)

    @on_timer(WINDOW_TIMER)
    def on_window_timer(
        self,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      return self.flush_batch(element_state, count_state, buffering_timer)

    @on_timer(BUFFERING_TIMER)
    def on_buffering_timer(
        self,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      return self.flush_batch(element_state, count_state, buffering_timer)

    def flush_batch(self, element_state, count_state, buffering_timer):
      batch = [element for element in element_state.read()]
      if not batch:
        return
      key, _ = batch[0]
      batch_values = [v for (k, v) in batch]
      element_state.clear()
      count_state.clear()
      buffering_timer.clear()
      yield key, batch_values

  return _GroupIntoBatchesDoFn()
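This factory is the kind of internal helper that backs the SDK's public GroupIntoBatches transform, so a pipeline would normally use that transform rather than calling the factory directly. A minimal sketch, assuming keyed input and a Beam release whose GroupIntoBatches accepts max_buffering_duration_secs:

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create' >> beam.Create([('k', i) for i in range(10)])
      | 'Batch' >> beam.GroupIntoBatches(
          batch_size=3, max_buffering_duration_secs=10)
      | 'Print' >> beam.Map(print))  # e.g. ('k', [0, 1, 2])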
def expand(self, pcoll):
  output = (
      pcoll
      | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
      | "FilterForPickups" >> beam.Filter(lambda x: x.ride_status == 'pickup')
      | "WindowByMinute" >> beam.WindowInto(
          beam.window.FixedWindows(60),
          trigger=AfterWatermark(late=AfterCount(1)),
          allowed_lateness=60,
          accumulation_mode=AccumulationMode.ACCUMULATING)
      | "CountPerMinute" >> beam.CombineGlobally(
          CountCombineFn()).without_defaults())
  return output
class StatefulBufferingFn(DoFn):

  BUFFER_STATE = BagStateSpec('buffer', StrUtf8Coder())
  COUNT_STATE = userstate.CombiningValueStateSpec(
      'count', VarIntCoder(), CountCombineFn())

  def process(
      self,
      element,
      buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
      count_state=beam.DoFn.StateParam(COUNT_STATE)):
    key, value = element
    try:
      index_value = list(buffer_state.read()).index(value)
    except ValueError:
      # The value has not been seen for this key yet.
      index_value = -1
    if index_value < 0:
      buffer_state.add(value)
      index_value = count_state.read()
      count_state.add(1)
    yield ('{}_{}'.format(value, index_value), 1)
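A minimal sketch of applying StatefulBufferingFn: the input must be (key, value) pairs, and the sample data below is an assumption used only to show the per-key index assignment. Element processing order is not guaranteed, so which value gets which index can vary between runs.

import apache_beam as beam

with beam.Pipeline() as p:
  _ = (
      p
      | 'Create' >> beam.Create([('k', 'a'), ('k', 'b'), ('k', 'a')])
      | 'Buffer' >> beam.ParDo(StatefulBufferingFn())
      | 'Print' >> beam.Map(print))  # e.g. ('a_0', 1), ('b_1', 1), ('a_0', 1)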
def run():
  # Command line arguments
  parser = argparse.ArgumentParser(description='Load from Json into BigQuery')

  parser.add_argument(
      '--project', required=True, help='Specify Google Cloud project')
  parser.add_argument(
      '--region', required=True, help='Specify Google Cloud region')
  parser.add_argument(
      '--staging_location',
      required=True,
      help='Specify Cloud Storage bucket for staging')
  parser.add_argument(
      '--temp_location',
      required=True,
      help='Specify Cloud Storage bucket for temp')
  parser.add_argument(
      '--runner', required=True, help='Specify Apache Beam Runner')
  parser.add_argument(
      '--input_path', required=True, help='Path to events.json')
  parser.add_argument(
      '--table_name', required=True, help='BigQuery table name')

  opts = parser.parse_args()

  # Setting up the Beam pipeline options
  options = PipelineOptions(save_main_session=True)
  options.view_as(GoogleCloudOptions).project = opts.project
  options.view_as(GoogleCloudOptions).region = opts.region
  options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
  options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
  options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
      'batch-user-traffic-pipeline-', time.time_ns())
  options.view_as(StandardOptions).runner = opts.runner

  input_path = opts.input_path
  table_name = opts.table_name

  # Table schema for BigQuery
  table_schema = {
      "fields": [
          {"name": "user_id", "type": "STRING"},
          {"name": "page_views", "type": "INTEGER"},
          {"name": "total_bytes", "type": "INTEGER"},
          {"name": "max_bytes", "type": "INTEGER"},
          {"name": "min_bytes", "type": "INTEGER"},
      ]
  }

  # Create the pipeline
  p = beam.Pipeline(options=options)

  (p
   | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
   | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
   | 'PerUserAggregations' >> beam.GroupBy('user_id')
       .aggregate_field('user_id', CountCombineFn(), 'page_views')
       .aggregate_field('num_bytes', sum, 'total_bytes')
       .aggregate_field('num_bytes', max, 'max_bytes')
       .aggregate_field('num_bytes', min, 'min_bytes')
       .with_output_types(PerUserAggregation)
   | 'ToDict' >> beam.Map(to_dict)
   | 'WriteToBQ' >> beam.io.WriteToBigQuery(
       table_name,
       schema=table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

  logging.getLogger().setLevel(logging.INFO)
  logging.info("Building pipeline ...")

  p.run()
def run():
  # Command line arguments
  parser = argparse.ArgumentParser(description='Load from Json into BigQuery')

  parser.add_argument(
      '--project', required=True, help='Specify Google Cloud project')
  parser.add_argument(
      '--region', required=True, help='Specify Google Cloud region')
  parser.add_argument(
      '--staging_location',
      required=True,
      help='Specify Cloud Storage bucket for staging')
  parser.add_argument(
      '--temp_location',
      required=True,
      help='Specify Cloud Storage bucket for temp')
  parser.add_argument(
      '--runner', required=True, help='Specify Apache Beam Runner')
  parser.add_argument(
      '--input_path', required=True, help='Path to events.json')
  parser.add_argument(
      '--table_name', required=True, help='BigQuery table name')

  opts = parser.parse_args()

  # Setting up the Beam pipeline options
  options = PipelineOptions(save_main_session=True)
  options.view_as(GoogleCloudOptions).project = opts.project
  options.view_as(GoogleCloudOptions).region = opts.region
  options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
  options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
  options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
      'batch-minute-traffic-pipeline-', time.time_ns())
  options.view_as(StandardOptions).runner = opts.runner

  input_path = opts.input_path
  table_name = opts.table_name

  # Table schema for BigQuery
  table_schema = {
      "fields": [
          {"name": "page_views", "type": "INTEGER"},
          {"name": "timestamp", "type": "STRING"},
      ]
  }

  # Create the pipeline
  p = beam.Pipeline(options=options)

  (p
   | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
   | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
   | 'AddEventTimestamp' >> beam.Map(add_timestamp)
   | "WindowByMinute" >> beam.WindowInto(beam.window.FixedWindows(60))
   | "CountPerMinute" >> beam.CombineGlobally(
       CountCombineFn()).without_defaults()
   | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
   | 'WriteToBQ' >> beam.io.WriteToBigQuery(
       table_name,
       schema=table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

  logging.getLogger().setLevel(logging.INFO)
  logging.info("Building pipeline ...")

  p.run()
def run():
  # Command line arguments
  parser = argparse.ArgumentParser(
      description='Load from PubSub into BigQuery')

  parser.add_argument(
      '--project', required=True, help='Specify Google Cloud project')
  parser.add_argument(
      '--region', required=True, help='Specify Google Cloud region')
  parser.add_argument(
      '--staging_location',
      required=True,
      help='Specify Cloud Storage bucket for staging')
  parser.add_argument(
      '--temp_location',
      required=True,
      help='Specify Cloud Storage bucket for temp')
  parser.add_argument(
      '--accum_mode', required=True, help='Accumulation mode for pipeline')

  opts, pipeline_args = parser.parse_known_args()

  options = PipelineOptions(pipeline_args, save_main_session=True)
  options.view_as(
      GoogleCloudOptions).job_name = f"{opts.accum_mode}-{time.time_ns()}"
  options.view_as(GoogleCloudOptions).project = opts.project
  options.view_as(GoogleCloudOptions).region = opts.region
  options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
  options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  table_schema = {
      "fields": [
          {"name": "taxi_events", "type": "INTEGER"},
          {"name": "timestamp", "type": "STRING"},
      ]
  }

  input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
  output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

  if opts.accum_mode == 'accumulating':
    accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
  elif opts.accum_mode == 'discarding':
    accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING
  else:
    raise ValueError(
        "Invalid accumulation mode value. Use 'accumulating' or 'discarding'.")

  p = beam.Pipeline(options=options)

  (p
   | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
   | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
   | 'WindowByMinute' >> beam.WindowInto(
       beam.window.FixedWindows(60),
       trigger=AfterWatermark(early=AfterProcessingTime(10)),
       accumulation_mode=accum_mode)
   | "CountPerMinute" >> beam.CombineGlobally(
       CountCombineFn()).without_defaults()
   | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
   | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
       output_table,
       schema=table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  logging.getLogger().setLevel(logging.INFO)
  logging.info("Building pipeline ...")

  p.run()
def run():
  # Command line arguments
  parser = argparse.ArgumentParser(
      description='Load from Json from Pub/Sub into BigQuery')

  parser.add_argument(
      '--project', required=True, help='Specify Google Cloud project')
  parser.add_argument(
      '--region', required=True, help='Specify Google Cloud region')
  parser.add_argument(
      '--staging_location',
      required=True,
      help='Specify Cloud Storage bucket for staging')
  parser.add_argument(
      '--temp_location',
      required=True,
      help='Specify Cloud Storage bucket for temp')
  parser.add_argument(
      '--runner', required=True, help='Specify Apache Beam Runner')
  parser.add_argument(
      '--input_topic', required=True, help='Input Pub/Sub Topic')
  parser.add_argument(
      '--agg_table_name',
      required=True,
      help='BigQuery table name for aggregate results')
  parser.add_argument(
      '--raw_table_name',
      required=True,
      help='BigQuery table name for raw inputs')
  parser.add_argument(
      '--window_duration', required=True, help='Window duration in seconds')

  opts = parser.parse_args()

  # Setting up the Beam pipeline options
  options = PipelineOptions(save_main_session=True, streaming=True)
  options.view_as(GoogleCloudOptions).project = opts.project
  options.view_as(GoogleCloudOptions).region = opts.region
  options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
  options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
  options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
      'streaming-minute-traffic-pipeline-', time.time_ns())
  options.view_as(StandardOptions).runner = opts.runner

  input_topic = opts.input_topic
  raw_table_name = opts.raw_table_name
  agg_table_name = opts.agg_table_name
  window_duration = opts.window_duration

  # Table schemas for BigQuery
  agg_table_schema = {
      "fields": [
          {"name": "page_views", "type": "INTEGER"},
          {"name": "timestamp", "type": "STRING"},
      ]
  }

  raw_table_schema = {
      "fields": [
          {"name": "ip", "type": "STRING"},
          {"name": "user_id", "type": "STRING"},
          {"name": "user_agent", "type": "STRING"},
          {"name": "lat", "type": "FLOAT", "mode": "NULLABLE"},
          {"name": "lng", "type": "FLOAT", "mode": "NULLABLE"},
          {"name": "event_timestamp", "type": "STRING"},
          {"name": "processing_timestamp", "type": "STRING"},
          {"name": "http_request", "type": "STRING"},
          {"name": "http_response", "type": "INTEGER"},
          {"name": "num_bytes", "type": "INTEGER"},
      ]
  }

  # Create the pipeline
  p = beam.Pipeline(options=options)

  parsed_msgs = (
      p
      | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
      | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))

  # Branch 1: write the raw events to BigQuery.
  (parsed_msgs
   | "AddProcessingTimestamp" >> beam.Map(add_processing_timestamp)
   | 'WriteRawToBQ' >> beam.io.WriteToBigQuery(
       raw_table_name,
       schema=raw_table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  # Branch 2: window, count per window, and write the aggregates.
  (parsed_msgs
   | "WindowByMinute" >> beam.WindowInto(
       beam.window.FixedWindows(int(window_duration)))
   | "CountPerMinute" >> beam.CombineGlobally(
       CountCombineFn()).without_defaults()
   | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
   | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
       agg_table_name,
       schema=agg_table_schema,
       create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
       write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

  logging.getLogger().setLevel(logging.INFO)
  logging.info("Building pipeline ...")

  p.run().wait_until_finish()
def _pardo_group_into_batches_with_multi_bags(
    input_coder, batch_size, max_buffering_duration_secs, clock=time.time):
  ELEMENT_STATE_0 = BagStateSpec('values0', input_coder)
  ELEMENT_STATE_1 = BagStateSpec('values1', input_coder)
  ELEMENT_STATE_2 = BagStateSpec('values2', input_coder)
  ELEMENT_STATE_3 = BagStateSpec('values3', input_coder)
  COUNT_STATE = CombiningValueStateSpec('count', input_coder, CountCombineFn())
  WINDOW_TIMER = TimerSpec('window_end', TimeDomain.WATERMARK)
  BUFFERING_TIMER = TimerSpec('buffering_end', TimeDomain.REAL_TIME)

  class _GroupIntoBatchesDoFnWithMultiBags(DoFn):
    def process(
        self,
        element,
        window=DoFn.WindowParam,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        window_timer=DoFn.TimerParam(WINDOW_TIMER),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      window_timer.set(window.end)
      count_state.add(1)
      count = count_state.read()

      element_states = [
          element_state_0, element_state_1, element_state_2, element_state_3]
      element_states[count % 4].add(element)

      if count == 1 and max_buffering_duration_secs > 0:
        # This is the first element in the batch. Start counting buffering time
        # if a limit was set.
        buffering_timer.set(clock() + max_buffering_duration_secs)
      if count >= batch_size:
        return self.flush_batch(element_states, count_state, buffering_timer)

    @on_timer(WINDOW_TIMER)
    def on_window_timer(
        self,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      element_states = [
          element_state_0, element_state_1, element_state_2, element_state_3]
      return self.flush_batch(element_states, count_state, buffering_timer)

    @on_timer(BUFFERING_TIMER)
    def on_buffering_timer(
        self,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      element_states = [
          element_state_0, element_state_1, element_state_2, element_state_3]
      return self.flush_batch(element_states, count_state, buffering_timer)

    def flush_batch(self, element_states, count_state, buffering_timer):
      batch_values = []
      for element_state in element_states:
        for k, v in element_state.read():
          key = k
          batch_values.append(v)
        element_state.clear()
      count_state.clear()
      buffering_timer.clear()
      if not batch_values:
        return
      yield key, batch_values

  return _GroupIntoBatchesDoFnWithMultiBags()