def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='short_tremor_data.csv',
                      help='Input file to process.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  # Use the options parsed above instead of a fresh PipelineOptions().
  with beam.Pipeline(options=pipeline_options) as p:
    acceleration_data = (
        p
        | 'Read' >> ReadFromText(known_args.input)
        # | 'Timestamp' >> beam.Map(
        #     lambda x: beam.window.TimestampedValue(x, extract_timestamp(x)))
        | 'Timestamp' >> beam.ParDo(TransformTimestampDoFn()))
    windowed_data = (
        acceleration_data
        | 'Window' >> beam.WindowInto(window.SlidingWindows(30, 10))
        # | 'Count' >> (beam.CombineGlobally(
        #     beam.combiners.CountCombineFn()).without_defaults())
        | 'Clean' >> beam.ParDo(MagSumAcc())
        # | 'Format' >> beam.ParDo(FormatDoFn())
    )
    added_value = (
        windowed_data
        | 'Count' >> beam.CombineGlobally(b_mean).without_defaults())
    # Tuple-parameter lambdas like `lambda (x): ...` are Python 2 only;
    # print each element directly instead.
    added_value | 'Print' >> beam.Map(print)
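# The DoFns above (TransformTimestampDoFn, MagSumAcc) are defined elsewhere
# in the project. A minimal sketch of TransformTimestampDoFn, assuming each
# CSV line starts with a unix timestamp in seconds (a hypothetical layout):
import apache_beam as beam


class TransformTimestampDoFn(beam.DoFn):
  def process(self, element):
    # Assumed field layout: "<epoch_seconds>,<x>,<y>,<z>".
    fields = element.split(',')
    yield beam.window.TimestampedValue(element, float(fields[0]))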
def expand(self, pcoll):
  return (
      pcoll
      | 'Timestamp' >> beam.ParDo(TransformTimestampDoFn())
      | 'Window' >> beam.WindowInto(
          window.SlidingWindows(self.window_duration, self.window_overlap))
      | 'ParseAccEventFn' >> beam.ParDo(ParseAccEventFn())
      | 'Extract:' >> beam.ParDo(ExtractDoFn()))
def test_sliding_windows(self):
  self.run_windowed_side_inputs(
      [1, 2, 4],
      window.SlidingWindows(size=6, period=2),
      window.SlidingWindows(size=6, period=2),
      expected=[
          # Element 1 falls in three windows
          (1, [1]),        # [-4, 2)
          (1, [1, 2]),     # [-2, 4)
          (1, [1, 2, 4]),  # [0, 6)
          # as does 2,
          (2, [1, 2]),     # [-2, 4)
          (2, [1, 2, 4]),  # [0, 6)
          (2, [2, 4]),     # [2, 8)
          # and 4.
          (4, [1, 2, 4]),  # [0, 6)
          (4, [2, 4]),     # [2, 8)
          (4, [4]),        # [4, 10)
      ])
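# For reference, a tiny helper that mirrors sliding-window assignment: an
# element with timestamp t lands in every window [start, start + size)
# whose start is a multiple of the period (plus any offset). It reproduces
# the window ranges noted in the comments above.
def sliding_windows_for(t, size, period, offset=0):
  last_start = t - (t - offset) % period
  return [(start, start + size)
          for start in range(last_start - size + period,
                             last_start + period, period)]


assert sliding_windows_for(1, size=6, period=2) == [(-4, 2), (-2, 4), (0, 6)]
assert sliding_windows_for(4, size=6, period=2) == [(0, 6), (2, 8), (4, 10)]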
def expand(self, pcoll):
  return (
      pcoll
      | 'Timestamp' >> beam.ParDo(TransformTimestampDoFn())
      | 'Window' >> beam.WindowInto(
          window.SlidingWindows(self.window_duration, self.window_overlap))
      | 'ParseAccEventFn' >> beam.ParDo(ParseAccEventFn())
      # Extract username/mag_sum_acc pairs from the event data.
      # | 'ExtractAndSumScore' >> ExtractAndMeanMagSumAcc('user')
      | 'Extract:' >> beam.ParDo(ExtractDoFn()))
def run(argv=None):
  # Use the Python argparse module to parse custom arguments.
  parser = argparse.ArgumentParser()
  parser.add_argument('--network')
  parser.add_argument('--input', dest='input', help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      help='Output file to write results to.')
  parser.add_argument('--output_topic',
                      dest='out_topic',
                      help=('Output PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  parser.add_argument('--input_topic',
                      dest='in_topic',
                      help=('Input PubSub topic of the form '
                            '"projects/<PROJECT>/topics/<TOPIC>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)
  p_options = PipelineOptions(pipeline_args)
  google_cloud_options = p_options.view_as(GoogleCloudOptions)
  google_cloud_options.region = 'europe-west1'
  google_cloud_options.project = 'smartlive'
  # google_cloud_options.job_name = 'dataflow-job-{}'.format(
  #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
  google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
  google_cloud_options.temp_location = 'gs://rim-bucket/temp'

  # For local execution, set the runner to "DirectRunner";
  # for cloud execution, set it to "DataflowRunner".
  p_options.view_as(StandardOptions).runner = 'DirectRunner'
  p_options.view_as(SetupOptions).save_main_session = True
  p_options.view_as(StandardOptions).streaming = True
  p_options.view_as(WorkerOptions).subnetwork = (
      'regions/europe-west1/subnetworks/test')

  p = beam.Pipeline(options=p_options)

  # Read from PubSub into a PCollection.
  # ReadFromPubSub emits bytes, so declare the *output* type here.
  lines = (p
           | 'receive_data' >> beam.io.ReadFromPubSub(
               subscription=known_args.in_topic).with_output_types(bytes)
           | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
           | 'jsonload' >> beam.Map(lambda x: json.loads(x)))

  # ------------------- sliding window ------------------- #
  (lines
   | 'timestamp' >> beam.Map(get_timestamp)
   | 'window' >> beam.WindowInto(window.SlidingWindows(20, 10))
   | 'Count' >> beam.CombineGlobally(
       beam.combiners.CountCombineFn()).without_defaults()
   | 'printnbrarticles' >> beam.ParDo(PrintFn()))

  (lines
   | 'jsondumps' >> beam.Map(lambda x: json.dumps(x))
   | 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
   | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic))

  p.run().wait_until_finish()
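# Minimal sketches of the helpers assumed above (the names come from the
# pipeline; the bodies are assumptions): get_timestamp attaches an event
# timestamp taken from the parsed JSON, and PrintFn logs each element.
import apache_beam as beam


def get_timestamp(element):
  # Assumed: each JSON record carries an epoch-seconds 'timestamp' field.
  return beam.window.TimestampedValue(element, element['timestamp'])


class PrintFn(beam.DoFn):
  def process(self, element):
    print(element)
    yield element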
def expand(self, pcoll):
  return (
      pcoll
      # Assigns window info to each Pub/Sub message based on its
      # publish timestamp.
      | "Window into Sessions" >> beam.WindowInto(
          # window.Sessions(self.gap_size)
          window.SlidingWindows(10, 5))
      | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
      | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
      | "Groupby" >> beam.GroupByKey()
      | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))
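# A sketch of the AddTimestamps DoFn used above, following the common
# Pub/Sub pattern of pairing each message body with the element's publish
# timestamp (the output dictionary layout is an assumption):
import datetime

import apache_beam as beam


class AddTimestamps(beam.DoFn):
  def process(self, element, publish_time=beam.DoFn.TimestampParam):
    yield {
        'message_body': element.decode('utf-8'),
        'publish_time': datetime.datetime.utcfromtimestamp(
            float(publish_time)).strftime('%Y-%m-%d %H:%M:%S.%f'),
    }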
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--project', help=('Google Cloud Project ID'), required=True) parser.add_argument( '--input_topic', help=('Google Cloud PubSub topic name '), required=True) known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions( pipeline_args.append('--project={}'.format(known_args.project))) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True p = beam.Pipeline(options=pipeline_options) TOPIC = 'projects/{}/topics/{}'.format(known_args.project, known_args.input_topic) # this table needs to exist table_spec = '{}:taxifare.traffic_realtime'.format(known_args.project) def to_bq_format(count): """BigQuery writer requires rows to be stored as python dictionary""" return {'trips_last_5min': count, 'time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")} pipeline = (p | 'read_from_pubsub' >> beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes) | 'window' >> beam.WindowInto(window.SlidingWindows( size=300, period=15)) | 'count' >> beam.CombineGlobally(CountFn()).without_defaults() | 'format_for_bq' >> beam.Map(to_bq_format) | 'write_to_bq' >> beam.io.WriteToBigQuery( table_spec, # WRITE_TRUNCATE not supported for streaming write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER) ) result = p.run()
def test_setting_sliding_windows(self):
  with TestPipeline() as p:
    unkeyed_items = p | beam.Create([2, 16, 23])
    items = (unkeyed_items
             | 'key' >> beam.Map(
                 lambda x: beam.window.TimestampedValue(('k', x), x)))
    # [START setting_sliding_windows]
    from apache_beam import window
    sliding_windowed_items = (
        items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
    # [END setting_sliding_windows]
    summed = (sliding_windowed_items
              | 'group' >> beam.GroupByKey()
              | 'combine' >> beam.CombineValues(sum))
    unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
    assert_that(unkeyed,
                equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
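# For reference: with size=30 and period=5, element 2 falls in the six
# windows starting at -25..0, element 16 in those starting at -10..15, and
# element 23 in those starting at -5..20. Summing per key in each window
# and dropping the key yields exactly the ten values asserted above:
# 2, 2, 2 (only 2), 18 (2+16), 41, 41 (2+16+23), 39, 39, 39 (16+23),
# and 23 (only 23).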
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query5_sliding_window' >> beam.WindowInto(
          window.SlidingWindows(metadata.get('window_size_sec'),
                                metadata.get('window_period_sec')))
      # Project out only the auction id for each bid.
      | 'extract_bid_auction' >> beam.Map(lambda bid: bid.auction)
      | 'bid_count_per_auction' >> beam.combiners.Count.PerElement()
      | 'bid_max_count' >> beam.CombineGlobally(
          MostBidCombineFn()).without_defaults()
      # TODO(leiyiz): fanout with sliding window produces duplicated results,
      # uncomment after it is fixed [BEAM-10617]
      # .with_fanout(metadata.get('fanout'))
      | beam.FlatMap(lambda auc_count: [{
          ResultNames.AUCTION_ID: auction,
          ResultNames.NUM: auc_count[1]
      } for auction in auc_count[0]]))
def run(input_topic, output_topic, pipeline_args=None):
  pipeline_options = PipelineOptions(pipeline_args,
                                     streaming=True,
                                     save_main_session=True)
  with beam.Pipeline(options=pipeline_options) as p:
    (p
     | 'Read PubSub Messages' >> beam.io.ReadFromPubSub(topic=input_topic)
     # | 'ReadText' >> beam.io.ReadFromText("data.json")
     | 'ParseJSON' >> beam.ParDo(ParseData())
     | 'AddEventTs' >> beam.ParDo(AddTimeStampFn())
     | 'Windowing' >> beam.WindowInto(
         window.SlidingWindows(size=120, period=60),
         allowed_lateness=Duration(seconds=60))
     | 'GroupByKey' >> beam.GroupByKey()
     | 'CalculateCorrelation' >> beam.ParDo(CalculateCorrelation())
     | 'FilterCorrelation' >> beam.Filter(
         lambda x: x[1] < CORRELATION_THRESHOLD)
     | 'PublishCorrelation' >> beam.io.WriteToPubSub(output_topic))
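# A sketch of the CalculateCorrelation DoFn above, assuming each grouped
# value is an (x, y) pair and the DoFn emits the Pearson correlation of
# the two series inside the window (the pair layout is an assumption):
import apache_beam as beam
import numpy as np


class CalculateCorrelation(beam.DoFn):
  def process(self, element):
    key, pairs = element
    xs, ys = zip(*pairs)
    # np.corrcoef returns the 2x2 correlation matrix; take the
    # off-diagonal entry.
    yield (key, float(np.corrcoef(xs, ys)[0, 1]))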
def load(events, metadata=None):
  # Find winning bids for each closed auction.
  all_winning_bids = (events
                      | beam.Filter(nexmark_query_util.auction_or_bid)
                      | winning_bids.WinningBids())
  return (
      all_winning_bids
      # Key winning bids by auction category.
      | beam.Map(lambda auc_bid: (auc_bid.auction.category,
                                  auc_bid.bid.price))
      # Re-window for the sliding average.
      | beam.WindowInto(
          window.SlidingWindows(metadata.get('window_size_sec'),
                                metadata.get('window_period_sec')))
      # Average for each category.
      | beam.CombinePerKey(beam.combiners.MeanCombineFn())
      # TODO(leiyiz): fanout with sliding window produces duplicated results,
      # uncomment after it is fixed [BEAM-10617]
      # .with_hot_key_fanout(metadata.get('fanout'))
      # Produce output.
      | beam.ParDo(ProjectToCategoryPriceFn()))
def movingAverageOf(p, project, event, speed_up_factor):
  averagingInterval = 3600 / speed_up_factor
  averagingFrequency = averagingInterval / 2
  topic = "projects/{}/topics/{}".format(project, event)
  eventType = FieldNumberLookup.create(event)
  flights = (
      p
      | "{}:read".format(event) >> beam.io.ReadFromPubSub(topic=topic)
      | "{}:window".format(event) >> beam.WindowInto(
          window.SlidingWindows(averagingInterval, averagingFrequency))
      | "{}:parse".format(event) >> beam.Map(
          lambda elm: Flight(elm.decode("utf-8").split(","), eventType)))

  stats = {}
  stats["delay"] = (
      flights
      | "{}:airport_delay".format(event) >> beam.Map(
          lambda elm: (elm.airport, elm.delay))
      | "{}:avgdelay".format(event) >> beam.combiners.Mean.PerKey())
  stats["timestamp"] = (
      flights
      | "{}:timestamps".format(event) >> beam.Map(
          lambda elm: (elm.airport, elm.timestamp))
      | "{}:lastTimeStamp".format(event) >> beam.CombinePerKey(
          lambda elem: max(elem)))
  stats["num_flights"] = (
      flights
      | "{}:numflights".format(event) >> beam.Map(
          lambda elm: (elm.airport, 1))
      | "{}:total".format(event) >> beam.combiners.Count.PerKey())
  return stats
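# Illustrative wiring (the project and event names are assumptions):
#
#   p = beam.Pipeline(options=pipeline_options)
#   stats = movingAverageOf(p, project='my-project', event='departed',
#                           speed_up_factor=60)
#
# With speed_up_factor=60, the flights are averaged over a 60-second
# sliding window that advances every 30 seconds.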
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input_mode',
                      default='file',
                      help='Streaming input or file based batch input')
  for ticker in TICKER_LIST:
    parser.add_argument(
        '--input_{}'.format(ticker),
        default='{}_hist.csv'.format(ticker),
        help='Cloud Pub/Sub topic of tick market data for a stock; '
             'falls back to a flat csv')
  parser.add_argument(
      '--output_topic',
      default='/tmp/trading_signals.txt',
      help='Topic of output trading signals in Google Cloud Pub/Sub')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  if known_args.input_mode == 'stream':
    pipeline_options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=pipeline_options) as p:
    # Read input.
    input_stage = {}
    for ticker in TICKER_LIST:
      # The mode string must match the 'stream' value tested above
      # (the original compared against 'streaming' here).
      if known_args.input_mode == 'stream':
        # In streaming mode the per-ticker argument holds the Pub/Sub topic.
        input_ticks = (
            p | 'Read: %s' % ticker >> beam.io.ReadFromPubSub(
                topic=getattr(known_args, 'input_%s' % ticker))
            .with_output_types(six.binary_type))
      else:
        input_ticks = (p | 'Read: %s' % ticker >> ReadFromText(
            getattr(known_args, 'input_%s' % ticker)))
      input_stage[ticker] = (
          input_ticks
          | 'decode: %s' % ticker >> beam.Map(lambda x: x.decode('utf-8'))
          | 'Filter: %s' % ticker >> beam.Filter(
              lambda row: row.split(',')[0] != 'date')
          | 'Add Timestamp: %s' % ticker >> beam.ParDo(AddTimestampDoFn())
          | 'Window: %s' % ticker >> beam.WindowInto(
              window.SlidingWindows(size=SECONDS_IN_1_DAY * 10,
                                    period=SECONDS_IN_1_DAY))
          | 'Pair: %s' % ticker >> beam.ParDo(CorrelationPairDoFn(ticker)))

    # Group together all entries under the same ticker.
    grouped = input_stage | 'group_by_name' >> beam.CoGroupByKey()
    correlations = (grouped
                    | 'Calculate pair correlation' >> beam.Map(
                        calculate_correlation_pair))

    if known_args.input_mode == 'stream':
      trading_signals = (
          correlations
          | 'Filter correlation threshold' >> beam.Filter(
              lambda x: x[1] < CORRELATION_THRESHOLD))
      # pylint: disable=expression-not-assigned
      trading_signals | beam.io.WriteToPubSub(known_args.output_topic)
    else:
      trading_signals = (
          correlations
          | 'Filter correlation threshold' >> beam.Filter(
              lambda x: x[1] < CORRELATION_THRESHOLD))
      # pylint: disable=expression-not-assigned
      trading_signals | 'WriteOutput' >> WriteToText(known_args.output_topic)
      'gamma': 1.2
  }
  model = xgb.train(best_params,
                    dtrain,
                    num_boost_round=1000,
                    evals=watchlist,
                    evals_result=evals_result,
                    verbose_eval=True)
  test.loc[:, "predict"] = model.predict(dtest)
  return test[["shop_id", "date", "predict", "sales"]].to_dict(orient='records')


(pipeline
 | "Query data" >> beam.io.Read(beam.io.BigQuerySource(query=query))
 | "Assign time" >> beam.Map(assign_timevalue)
 | "Set window" >> beam.WindowInto(window.SlidingWindows(size=3, period=1))
 | "Set group key" >> beam.Map(lambda v: ('shop_id', v))
 | beam.GroupByKey()
 | "Learn and predict" >> beam.FlatMap(learn_predict)
 | "Write data" >> beam.io.Write(beam.io.BigQuerySink(
     'dataset.table',
     schema="shop_id:STRING, date:STRING, predict:FLOAT, sales:INTEGER",
     write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
     create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))
pipeline.run()
def run(argv=None, save_main_session=True):
  """Main function for building and running the pipeline."""
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--input_topic',
                          dest='input_topic',
                          required=True,
                          help=('Input PubSub topic of the form '
                                '"projects/<PROJECT>/topics/<TOPIC>".'))
  arg_parser.add_argument(
      '--output_load_table_suffix',
      dest='output_l',
      required=True,
      help=('Output BQ table to write results to (suffix): '
            '"[datasetID].[tableID]". Since we have 8 buildings, each '
            'building will be loaded into the corresponding table, '
            'e.g. given "energy.building", building 1\'s data will be '
            'loaded into energy.building1.'))
  arg_parser.add_argument(
      '--output_stream_table',
      dest='output_s',
      required=True,
      help='Output BQ table to write results to: "[datasetID].[tableID]"')
  arg_parser.add_argument(
      '--output_topic',
      dest='output_topic',
      required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>", e.g. '
            '"projects/building-energy-consumption/topics/energy_stream".'))
  arg_parser.add_argument(
      '--speedFactor',
      dest='speedFactor',
      required=False,
      default=300,
      type=int,
      help=('How wide you want your window to be (in seconds), '
            'e.g. 3600 => 1 hr window.'))

  # Initiate the pipeline using the pipeline arguments passed in from the
  # command line. This includes information like where Dataflow should
  # store temp files, and what the project id is.
  known_args, pipeline_args = arg_parser.parse_known_args(argv)
  options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=options)

  # Require the --project option to access --dataset.
  if options.view_as(GoogleCloudOptions).project is None:
    arg_parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # Use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  rowToBQ = BQTranslateTransformation()

  # Ingest pubsub messages, extract the data, and save it to `lines`
  # so it can be used by both the batch ingest and stream aggregations.
  lines = (p
           | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
               topic=known_args.input_topic).with_output_types(bytes)
           | 'ConvertFromBytesToStr' >> beam.Map(lambda b: b.decode('utf-8')))

  # Convert rows of str to BQ rows, and load batch data to the tables on a
  # daily basis by setting batch_size to the rows per day. batch_size is
  # the number of rows written to BQ per streaming API insert.
  rows = (lines
          | 'StringToBigQueryRowLoad' >> beam.Map(
              lambda s: rowToBQ.parse_method_load(s)))

  # load_schema taken from the json file extracted by processCSV.py.
  # In a realistic scenario, you won't be able to automate it like this,
  # but will probably have to insert the schema manually.
  load_schema = rowToBQ.schemas

  # Filter and load into 8 tables based off of the given table suffix argument.
  load1 = (rows
           | 'FilterBuilding1' >> beam.Filter(
               lambda row: int(row['building_id']) == 1)
           | 'B1BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '1',
               schema=load_schema[0],
               batch_size=ROWS_PER_DAY))
  load2 = (rows
           | 'FilterBuilding2' >> beam.Filter(
               lambda row: int(row['building_id']) == 2)
           | 'B2BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '2',
               schema=load_schema[1],
               batch_size=ROWS_PER_DAY))
  load3 = (rows
           | 'FilterBuilding3' >> beam.Filter(
               lambda row: int(row['building_id']) == 3)
           | 'B3BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '3',
               schema=load_schema[2],
               batch_size=ROWS_PER_DAY))
  load4 = (rows
           | 'FilterBuilding4' >> beam.Filter(
               lambda row: int(row['building_id']) == 4)
           | 'B4BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '4',
               schema=load_schema[3],
               batch_size=ROWS_PER_DAY))
  load5 = (rows
           | 'FilterBuilding5' >> beam.Filter(
               lambda row: int(row['building_id']) == 5)
           | 'B5BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '5',
               schema=load_schema[4],
               batch_size=ROWS_PER_DAY))
  load6 = (rows
           | 'FilterBuilding6' >> beam.Filter(
               lambda row: int(row['building_id']) == 6)
           | 'B6BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '6',
               schema=load_schema[5],
               batch_size=ROWS_PER_DAY))
  load7 = (rows
           | 'FilterBuilding7' >> beam.Filter(
               lambda row: int(row['building_id']) == 7)
           | 'B7BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '7',
               schema=load_schema[6],
               batch_size=ROWS_PER_DAY))
  load8 = (rows
           | 'FilterBuilding8' >> beam.Filter(
               lambda row: int(row['building_id']) == 8)
           | 'B8BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '8',
               schema=load_schema[7],
               batch_size=ROWS_PER_DAY))

  # Stream aggregation pipeline, saved to `avgs` to be used both for
  # writing to BigQuery and for publishing to Pubsub.
  # A sliding window nominally 1 hour wide, adjusted according to speedFactor.
  window_size = known_args.speedFactor
  avgs = (lines
          | 'SetTimeWindow' >> beam.WindowInto(
              window.SlidingWindows(window_size, float(window_size) / 2))
          | 'ByBuilding' >> beam.ParDo(KVSplitDoFn())
          | 'GetAvgByBuilding' >> Mean.PerKey()
          | 'AddWindowStartTimestamp' >> beam.ParDo(WindowStartTimestampFn()))

  # Convert rows of str to BigQuery rows, and append to the BQ table.
  (avgs
   | 'StrToBigQueryRowStream' >> beam.Map(
       lambda s: rowToBQ.parse_method_stream(s))
   | 'WriteToBigQueryStream' >> beam.io.WriteToBigQuery(
       table=known_args.output_s,
       schema=rowToBQ.stream_schema,
       project=options.view_as(GoogleCloudOptions).project))

  # Write the message to pubsub with a different output_topic
  # for users to subscribe to and retrieve real-time analysis data.
  (avgs
   | 'Encode' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
   | 'PublishToPubSub' >> beam.io.WriteToPubSub(
       'projects/{}/topics/{}'.format(
           options.view_as(GoogleCloudOptions).project,
           known_args.output_topic)))

  # Nothing will run until this command.
  p.run()
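# A sketch of the KVSplitDoFn used above, equivalent to the commented-out
# Map in the variant of this pipeline further below: key each line by
# building id (2nd CSV column) with the general meter reading (3rd column)
# as the value.
import apache_beam as beam


class KVSplitDoFn(beam.DoFn):
  def process(self, element):
    fields = element.split(',')
    yield (fields[1], int(float(fields[2])))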
def run(argv=None, save_main_session=True):
  '''Build and run the pipeline.'''
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--input_topic',
                          dest='input_topic',
                          required=True,
                          help=('Input PubSub topic of the form '
                                '"projects/<PROJECT>/topics/<TOPIC>".'))
  arg_parser.add_argument(
      '--output_load_table_suffix',
      dest='output_l',
      required=True,
      help=('Output BQ table to write results to (suffix): '
            '"[datasetID].[tableID]". Since we have 8 buildings, each '
            'building will be loaded into the corresponding table, '
            'e.g. given "energy.building", building 1\'s data will be '
            'loaded into energy.building1.'))
  arg_parser.add_argument(
      '--output_stream_table',
      dest='output_s',
      required=True,
      help='Output BQ table to write results to: "[datasetID].[tableID]"')
  arg_parser.add_argument(
      '--output_topic',
      dest='output_topic',
      required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>", e.g. '
            '"projects/building-energy-consumption/topics/energy_stream".'))
  arg_parser.add_argument(
      '--speedFactor',
      dest='speedFactor',
      required=False,
      default=300,
      type=int,  # the window math below expects a number, not a string
      help=('How wide you want your window to be (in seconds), '
            'e.g. 3600 => 1 hr window.'))
  known_args, pipeline_args = arg_parser.parse_known_args(argv)
  # logging.info('parsed args: {}'.format(known_args))

  # Initiate the pipeline using the pipeline arguments passed in from the
  # command line. This includes information like where Dataflow should
  # store temp files, and what the project id is.
  options = PipelineOptions(pipeline_args)
  p = beam.Pipeline(options=options)
  # schema = parse_table_schema_from_json(data_ingestion.schema_str)

  # We also require the --project option to access --dataset.
  if options.view_as(GoogleCloudOptions).project is None:
    arg_parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  rowToBQ = BQTranslateTransformation()
  # with open(SCHEMA_PATH) as bq_schema_file:
  #     load_schema = json.load(load_schema_file)
  #     stream_schema = json.load(load_schema_file)

  # If new columns need to be added, add them with
  # [SCHEMATYPE]_schema['fields'].append({
  #     'name': [FIELDNAME],
  #     'type': [FIELDTYPE],
  #     'mode': [FIELDMODE],
  # })

  # Ingest pubsub messages, extract the data, and save it to `lines`
  # so it can be used by both the batch ingest and stream aggregations.
  lines = (p
           | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
               topic=known_args.input_topic).with_output_types(bytes)
           | 'ConvertFromBytesToStr' >> beam.Map(lambda b: b.decode('utf-8')))

  # Split into streaming inserts and batch loads, because loads are free
  # while streaming inserts are billed by the size of the data.
  # Convert rows of str to BQ rows and load batch data to the tables on a
  # daily basis by setting batch_size to the rows per day. batch_size is
  # the number of rows written to BQ per streaming API insert.
  rows = (lines
          | 'StringToBigQueryRowLoad' >> beam.Map(
              lambda s: rowToBQ.parse_method_load(s)))

  # load_schema taken from the json file extracted by processCSV.py.
  # In a realistic scenario, you won't be able to automate it like this,
  # but will probably have to insert the schema manually.
  load_schema = rowToBQ.schemas

  # Filter and load into 8 tables based off of the given table suffix argument.
  load1 = (rows
           | 'FilterBuilding1' >> beam.Filter(
               lambda row: int(row['building_id']) == 1)
           | 'B1BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '1',
               schema=load_schema[0],
               batch_size=ROWS_PER_DAY))
  load2 = (rows
           | 'FilterBuilding2' >> beam.Filter(
               lambda row: int(row['building_id']) == 2)
           | 'B2BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '2',
               schema=load_schema[1],
               batch_size=ROWS_PER_DAY))
  load3 = (rows
           | 'FilterBuilding3' >> beam.Filter(
               lambda row: int(row['building_id']) == 3)
           | 'B3BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '3',
               schema=load_schema[2],
               batch_size=ROWS_PER_DAY))
  load4 = (rows
           | 'FilterBuilding4' >> beam.Filter(
               lambda row: int(row['building_id']) == 4)
           | 'B4BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '4',
               schema=load_schema[3],
               batch_size=ROWS_PER_DAY))
  load5 = (rows
           | 'FilterBuilding5' >> beam.Filter(
               lambda row: int(row['building_id']) == 5)
           | 'B5BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '5',
               schema=load_schema[4],
               batch_size=ROWS_PER_DAY))
  load6 = (rows
           | 'FilterBuilding6' >> beam.Filter(
               lambda row: int(row['building_id']) == 6)
           | 'B6BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '6',
               schema=load_schema[5],
               batch_size=ROWS_PER_DAY))
  load7 = (rows
           | 'FilterBuilding7' >> beam.Filter(
               lambda row: int(row['building_id']) == 7)
           | 'B7BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '7',
               schema=load_schema[6],
               batch_size=ROWS_PER_DAY))
  load8 = (rows
           | 'FilterBuilding8' >> beam.Filter(
               lambda row: int(row['building_id']) == 8)
           | 'B8BQLoad' >> beam.io.WriteToBigQuery(
               table=known_args.output_l + '8',
               schema=load_schema[7],
               batch_size=ROWS_PER_DAY))

  # Stream aggregation pipeline, saved to `avgs` to be used both for
  # writing to BigQuery and for publishing to Pubsub.
  # A sliding window nominally 1 hour wide, adjusted according to speedFactor.
  window_size = known_args.speedFactor
  avgs = (
      lines
      # | 'AddEventTimestamps' >> beam.Map(lambda s: window.TimestampedValue(
      #     s, time.mktime(dateutil.parser.parse(s.split(',')[0]).timetuple())))
      # | 'AddEventTimestamps' >> beam.ParDo(AddTimestampDoFn())
      # | 'SetTimeWindow' >> beam.WindowInto(
      #     window.SlidingWindows(WINDOW_SIZE, WINDOW_PERIOD, offset=0))
      # Sliding window of [window_size] seconds, starting every
      # [window_size / 2] seconds.
      | 'SetTimeWindow' >> beam.WindowInto(
          window.SlidingWindows(window_size, float(window_size) / 2))
      # Split into k,v of building id (2nd column) and general meter
      # reading (3rd column).
      # TODO: currently GroupByKey is not producing output; or is the
      # window so wide that results simply take a long time to appear?
      # | 'ByBuilding' >> beam.Map(
      #     lambda s: (s.split(',')[1], int(float(s.split(',')[2]))))
      | 'ByBuilding' >> beam.ParDo(KVSplitDoFn())
      | 'GetAvgByBuilding' >> Mean.PerKey()
      # | 'CountByBuilding' >> Count.PerKey()
      | 'AddWindowStartTimestamp' >> beam.ParDo(WindowStartTimestampFn()))

  # Convert rows of str to BigQuery rows, and append to the BQ table.
  (avgs
   | 'StrToBigQueryRowStream' >> beam.Map(
       lambda s: rowToBQ.parse_method_stream(s))
   | 'WriteToBigQueryStream' >> beam.io.WriteToBigQuery(
       table=known_args.output_s,
       schema=rowToBQ.stream_schema,
       project=options.view_as(GoogleCloudOptions).project))

  # Write the message to pubsub with a different output_topic
  # for users to subscribe to and retrieve real-time analysis data.
  (avgs
   | 'Encode' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
   | 'PublishToPubSub' >> beam.io.WriteToPubSub(
       'projects/{}/topics/{}'.format(
           options.view_as(GoogleCloudOptions).project,
           known_args.output_topic)))

  p.run()
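# A sketch of the WindowStartTimestampFn referenced above, assuming it
# prefixes each (building, average) pair with the start of the window
# that produced it (the output format is an assumption):
import apache_beam as beam


class WindowStartTimestampFn(beam.DoFn):
  def process(self, element, window=beam.DoFn.WindowParam):
    building_id, avg = element
    window_start = window.start.to_utc_datetime().strftime(
        '%Y-%m-%d %H:%M:%S')
    yield '{},{},{}'.format(window_start, building_id, avg)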
def run(argv=None): """Build and run the pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--output_topic', required=True, help=('Output PubSub topic of the form ' '"projects/<PROJECT>/topic/<TOPIC>".')) group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--input_topic', help=('Input PubSub topic of the form ' '"projects/<PROJECT>/topics/<TOPIC>".')) group.add_argument( '--input_subscription', help=('Input PubSub subscription of the form ' '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."')) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True pipeline_options.view_as(StandardOptions).streaming = True p = beam.Pipeline(options=pipeline_options) # Read from PubSub into a PCollection. if known_args.input_subscription: lines = p | beam.io.ReadStringsFromPubSub( subscription=known_args.input_subscription) else: lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = ( lines # | 'print1' >> beam.Map(print) | 'split' >> beam.Map(Extracting_X_Value) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | beam.WindowInto(window.SlidingWindows(10, 1, 0)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Branch 1: Alert when x hits -1.0 = 10 x times, by writing a message to PubSub alert = (counts | 'filter' >> beam.ParDo(Alerting_X_Value()) | 'write_to_pubsub' >> beam.io.WriteStringsToPubSub( known_args.output_topic)) # Branch 2: Print out the output # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '[{}] {}: {}'.format(time.ctime(), word, count) output = (counts | 'format' >> beam.Map(format_result) | 'print' >> beam.Map(print)) result = p.run() result.wait_until_finish()