Example 1
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='short_tremor_data.csv',
                        help='Input file to process.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        acceleration_data = (
            p
            | 'Read' >> ReadFromText(known_args.input)
            # | 'Timestamp' >> beam.Map(lambda x: beam.window.TimestampedValue( x, extract_timestamp(x)))
            | 'Timestamp' >> beam.ParDo(TransformTimestampDoFn()))

        windowed_data = (
            acceleration_data
            | 'Window' >> beam.WindowInto(window.SlidingWindows(30, 10))
            # | 'Count' >> (beam.CombineGlobally(beam.combiners.CountCombineFn()).without_defaults())
            | 'Clean' >> beam.ParDo(MagSumAcc())
            # | 'Format' >> beam.ParDo(FormatDoFn())
        )

        added_value = (windowed_data
                       | 'Count' >>
                       (beam.CombineGlobally(b_mean).without_defaults()))

        # sums | 'Print' >> beam.ParDo(lambda (x): print('%s' % (x)))
        added_value | 'Print' >> beam.Map(print)
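Example 1 timestamps each record, applies 30-second sliding windows that start every 10 seconds, and combines each window into a global mean with b_mean. A minimal, self-contained sketch of the same pattern, with the CSV input and the custom DoFns replaced by in-memory stand-ins (the stand-in values are assumptions for illustration only):

import apache_beam as beam
from apache_beam.transforms import window

with beam.Pipeline() as p:
    (p
     # (value, event-time seconds) pairs standing in for the CSV rows.
     | 'Create' >> beam.Create([(3.0, 1), (4.5, 12), (6.0, 25)])
     # Returning a TimestampedValue sets the element's event timestamp.
     | 'Timestamp' >> beam.Map(
         lambda kv: window.TimestampedValue(kv[0], kv[1]))
     # 30-second windows, starting every 10 seconds.
     | 'Window' >> beam.WindowInto(window.SlidingWindows(30, 10))
     # Per-window global mean; without_defaults() skips empty windows.
     | 'Mean' >> beam.CombineGlobally(
         beam.combiners.MeanCombineFn()).without_defaults()
     | 'Print' >> beam.Map(print))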
Example 2
 def expand(self, pcoll):
     return (pcoll
             | 'Timestamp' >> beam.ParDo(TransformTimestampDoFn())
             | 'Window' >> beam.WindowInto(
                 window.SlidingWindows(self.window_duration,
                                       self.window_overlap))
             | 'ParseAccEventFn' >> beam.ParDo(ParseAccEventFn())
             | 'Extract:' >> beam.ParDo(ExtractDoFn()))
Example 3
 def test_sliding_windows(self):
     self.run_windowed_side_inputs(
         [1, 2, 4],
         window.SlidingWindows(size=6, period=2),
         window.SlidingWindows(size=6, period=2),
         expected=[
             # Element 1 falls in three windows
             (1, [1]),  # [-4, 2)
             (1, [1, 2]),  # [-2, 4)
             (1, [1, 2, 4]),  # [0, 6)
             # as does 2,
             (2, [1, 2]),  # [-2, 4)
             (2, [1, 2, 4]),  # [0, 6)
             (2, [2, 4]),  # [2, 8)
             # and 4.
             (4, [1, 2, 4]),  # [0, 6)
             (4, [2, 4]),  # [2, 8)
             (4, [4]),  # [4, 10)
         ])
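The window ranges in the comments follow from how SlidingWindows assigns windows: an element with timestamp t lands in every window [start, start + size) whose start is a multiple of the period (default offset 0) and satisfies start <= t < start + size. A small stand-alone illustration of that rule for non-negative timestamps (a sketch, not Beam's actual implementation):

def sliding_windows_for(t, size, period):
    """Windows of SlidingWindows(size, period) that contain timestamp t."""
    latest_start = t - (t % period)  # last window start at or before t
    return [(start, start + size)
            for start in range(latest_start, t - size, -period)]

print(sliding_windows_for(1, size=6, period=2))  # [(0, 6), (-2, 4), (-4, 2)]
print(sliding_windows_for(4, size=6, period=2))  # [(4, 10), (2, 8), (0, 6)]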
Example 4
 def expand(self, pcoll):
     return (
         pcoll
         | 'Timestamp' >> beam.ParDo(TransformTimestampDoFn())
         | 'Window' >> beam.WindowInto(
             window.SlidingWindows(self.window_duration,
                                   self.window_overlap))
         | 'ParseAccEventFn' >> beam.ParDo(ParseAccEventFn())
         # Extract username/mag_sum_acc pairs from the event data.
         # | 'ExtractAndSumScore' >> ExtractAndMeanMagSumAcc('user')
         | 'Extract:' >> beam.ParDo(ExtractDoFn()))
Example 5
def run(argv=None):
    # Use Python argparse module to parse custom arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--network')
    parser.add_argument('--input', dest='input', help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to.')
    parser.add_argument('--output_topic',
                        dest='out_topic',
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topics/<TOPIC>".'))
    parser.add_argument('--input_topic',
                        dest='in_topic',
                        help=('Input PubSub topic of the form '
                              '"projects/<PROJECT>/topics/<TOPIC>".'))
    known_args, pipeline_args = parser.parse_known_args(argv)
    p_options = PipelineOptions(pipeline_args)
    google_cloud_options = p_options.view_as(GoogleCloudOptions)
    google_cloud_options.region = 'europe-west1'
    google_cloud_options.project = 'smartlive'
    '''google_cloud_options.job_name = 'dataflow-job-{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d%H%M%S")
    )'''
    google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
    google_cloud_options.temp_location = 'gs://rim-bucket/temp'
    # For local execution set the runner to "DirectRunner";
    # for Cloud execution set it to "DataflowRunner".
    p_options.view_as(StandardOptions).runner = 'DirectRunner'
    p_options.view_as(SetupOptions).save_main_session = True
    p_options.view_as(StandardOptions).streaming = True
    p_options.view_as(WorkerOptions).subnetwork = (
        'regions/europe-west1/subnetworks/test')
    p = beam.Pipeline(options=p_options)

    # Read from PubSub into a PCollection.
    lines = (p
             | 'receive_data' >> beam.io.ReadFromPubSub(
                 topic=known_args.in_topic).with_output_types(bytes)
             | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
             | 'jsonload' >> beam.Map(json.loads))

    # ------------------- sliding window ------------------- #

    (lines
     | 'timestamp' >> beam.Map(get_timestamp)
     | 'window' >> beam.WindowInto(window.SlidingWindows(20, 10))
     | 'Count' >> beam.CombineGlobally(
         beam.combiners.CountCombineFn()).without_defaults()
     | 'printnbrarticles' >> beam.ParDo(PrintFn()))

    (lines
     | 'jsondumps' >> beam.Map(json.dumps)
     | 'encode' >> beam.Map(lambda x: x.encode('utf-8'))
     | 'send_to_Pub/Sub' >> beam.io.WriteToPubSub(known_args.out_topic))

    p.run().wait_until_finish()
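A sliding-window count like the one above can be exercised locally without Pub/Sub by feeding the DirectRunner from a TestStream; a rough sketch (the JSON payloads, and the replacement of get_timestamp/PrintFn with inline equivalents, are assumptions for illustration):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import window
from apache_beam.transforms.window import TimestampedValue

options = PipelineOptions(streaming=True)
with beam.Pipeline(options=options) as p:
    # Two timestamped elements, then advance the watermark past the windows.
    stream = (TestStream()
              .add_elements([TimestampedValue('{"id": 1}', 0),
                             TimestampedValue('{"id": 2}', 7)])
              .advance_watermark_to(30)
              .advance_watermark_to_infinity())
    (p
     | stream
     | 'window' >> beam.WindowInto(window.SlidingWindows(20, 10))
     | 'Count' >> beam.CombineGlobally(
         beam.combiners.CountCombineFn()).without_defaults()
     | 'print' >> beam.Map(print))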
Example 6
 def expand(self, pcoll):
     return (
         pcoll
         # Assigns window info to each Pub/Sub message based on its
         # publish timestamp.
         | "Window into Sessions"
         #>> beam.WindowInto(window.Sessions(self.gap_size))
         >> beam.WindowInto(window.SlidingWindows(10, 5))
         | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
         | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val))
Example 7
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
            '--project',
            help=('Google Cloud Project ID'),
            required=True)
    parser.add_argument(
            '--input_topic',
            help=('Google Cloud PubSub topic name '),
            required=True)

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.append('--project={}'.format(known_args.project))
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=pipeline_options)

    TOPIC = 'projects/{}/topics/{}'.format(known_args.project,
                                           known_args.input_topic)
    # this table needs to exist
    table_spec = '{}:taxifare.traffic_realtime'.format(known_args.project)

    def to_bq_format(count):
        """BigQuery writer requires rows to be stored as python dictionary"""
        return {'trips_last_5min': count,
                'time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

    pipeline = (p
                | 'read_from_pubsub' >> beam.io.ReadFromPubSub(topic=TOPIC).with_output_types(bytes)
                | 'window' >> beam.WindowInto(window.SlidingWindows(
                    size=300,
                    period=15))
                | 'count' >> beam.CombineGlobally(CountFn()).without_defaults()
                | 'format_for_bq' >> beam.Map(to_bq_format)
                | 'write_to_bq' >> beam.io.WriteToBigQuery(
                    table_spec,
                    # WRITE_TRUNCATE not supported for streaming
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                    create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER)
                )

    result = p.run()
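CountFn is defined elsewhere in that example; presumably it is a CombineFn that counts the trips that fall into each 5-minute window. A minimal sketch of such a combiner (an assumption; beam.combiners.CountCombineFn would do the same job):

import apache_beam as beam

class CountFn(beam.CombineFn):
    """Counts the elements that fall into each window."""
    def create_accumulator(self):
        return 0

    def add_input(self, accumulator, element):
        return accumulator + 1

    def merge_accumulators(self, accumulators):
        return sum(accumulators)

    def extract_output(self, accumulator):
        return accumulator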
Example 8
 def test_setting_sliding_windows(self):
   with TestPipeline() as p:
     unkeyed_items = p | beam.Create([2, 16, 23])
     items = (unkeyed_items
              | 'key' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(('k', x), x)))
     # [START setting_sliding_windows]
     from apache_beam import window
     sliding_windowed_items = (
         items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
     # [END setting_sliding_windows]
     summed = (sliding_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     assert_that(unkeyed,
                 equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
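For reference, the expected sums follow from SlidingWindows(30, 5): each element falls into six windows (one per 5-second-aligned start in the preceding 30 seconds), so 2 alone gives the three sums of 2 (windows starting at -25, -20 and -15), 2 together with 16 gives 18 (window [-10, 20)), all three elements give 41 (windows [-5, 25) and [0, 30)), 16 with 23 gives 39 (windows starting at 5, 10 and 15), and 23 alone gives 23 (window [20, 50)).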
Example 9
def load(events, metadata=None):
    return (
        events
        | nexmark_query_util.JustBids()
        | 'query5_sliding_window' >> beam.WindowInto(
            window.SlidingWindows(metadata.get('window_size_sec'),
                                  metadata.get('window_period_sec')))
        # project out only the auction id for each bid
        | 'extract_bid_auction' >> beam.Map(lambda bid: bid.auction)
        | 'bid_count_per_auction' >> beam.combiners.Count.PerElement()
        | 'bid_max_count' >> beam.CombineGlobally(
            MostBidCombineFn()).without_defaults()
        # TODO(leiyiz): fanout with sliding window produces duplicated results,
        #   uncomment after it is fixed [BEAM-10617]
        # .with_fanout(metadata.get('fanout'))
        | beam.FlatMap(lambda auc_count: [{
            ResultNames.AUCTION_ID: auction,
            ResultNames.NUM: auc_count[1]
        } for auction in auc_count[0]]))
Example 10
def run(input_topic, output_topic, pipeline_args=None):
    pipeline_options = PipelineOptions(pipeline_args,
                                       streaming=True,
                                       save_main_session=True)

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'Read PubSub Messages' >> beam.io.ReadFromPubSub(topic=input_topic)
         # | 'ReadText' >> beam.io.ReadFromText("data.json")
         | 'ParseJSON' >> beam.ParDo(ParseData())
         | 'AddEventTs' >> beam.ParDo(AddTimeStampFn())
         | 'Windowing' >> beam.WindowInto(
             window.SlidingWindows(size=120, period=60),
             allowed_lateness=Duration(seconds=60))
         | 'GroupByKey' >> beam.GroupByKey()
         | 'CalculateCorrelation' >> beam.ParDo(CalculateCorrelation())
         | 'FilterCorrelation' >>
         beam.Filter(lambda x: x[1] < CORRELATION_THRESHOLD)
         | 'PublishCorrelation' >> beam.io.WriteToPubSub(output_topic))
Example 11
def load(events, metadata=None):
    # find winning bids for each closed auction
    all_winning_bids = (events
                        | beam.Filter(nexmark_query_util.auction_or_bid)
                        | winning_bids.WinningBids())
    return (
        all_winning_bids
        # key winning bids by auction category
        |
        beam.Map(lambda auc_bid: (auc_bid.auction.category, auc_bid.bid.price))
        # re-window for sliding average
        | beam.WindowInto(
            window.SlidingWindows(metadata.get('window_size_sec'),
                                  metadata.get('window_period_sec')))
        # average for each category
        | beam.CombinePerKey(beam.combiners.MeanCombineFn())
        # TODO(leiyiz): fanout with sliding window produces duplicated results,
        #   uncomment after it is fixed [BEAM-10617]
        # .with_hot_key_fanout(metadata.get('fanout'))
        # produce output
        | beam.ParDo(ProjectToCategoryPriceFn()))
Example 12
def movingAverageOf(p, project, event, speed_up_factor):
    averagingInterval = 3600 / speed_up_factor
    averagingFrequency = averagingInterval / 2
    topic = "projects/{}/topics/{}".format(project, event)
    eventType = FieldNumberLookup.create(event)

    flights = (
        p
        | "{}:read".format(event) >> beam.io.ReadFromPubSub(topic=topic)
        | "{}:window".format(event) >> beam.WindowInto(
            window.SlidingWindows(averagingInterval, averagingFrequency))
        | "{}:parse".format(event) >>
        beam.Map(lambda elm: Flight(elm.decode("utf-8").split(","), eventType))
    )

    stats = {}

    stats["delay"] = (
        flights
        | "{}:airport_delay".format(event) >>
        beam.Map(lambda elm: (elm.airport, elm.delay))
        | "{}:avgdelay".format(event) >> beam.combiners.Mean.PerKey())

    stats["timestamp"] = (flights
                          | "{}:timestamps".format(event) >>
                          beam.Map(lambda elm: (elm.airport, elm.timestamp))
                          | "{}:lastTimeStamp".format(event) >>
                          beam.CombinePerKey(lambda elem: max(elem)))

    stats["num_flights"] = (
        flights
        |
        "{}:numflights".format(event) >> beam.Map(lambda elm: (elm.airport, 1))
        | "{}:total".format(event) >> beam.combiners.Count.PerKey())

    return stats
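movingAverageOf returns three keyed PCollections in the same sliding windows (mean delay, latest timestamp and flight count per airport). A typical next step is to join them per airport and window with CoGroupByKey; a sketch (the 'arrived' event name is an assumption for illustration):

    stats = movingAverageOf(p, project, 'arrived', speed_up_factor)
    per_airport = (
        {'delay': stats['delay'],
         'timestamp': stats['timestamp'],
         'num_flights': stats['num_flights']}
        | 'arrived:join' >> beam.CoGroupByKey())
    # Yields (airport, {'delay': [...], 'timestamp': [...], 'num_flights': [...]})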
Example 13
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_mode',
                        default='file',
                        help='Streaming input or file based batch input')

    for ticker in TICKER_LIST:
        parser.add_argument(
            '--input_{}'.format(ticker),
            default='{}_hist.csv'.format(ticker),
            help=
            'Cloud Pub/Sub topic of tick market data for a stock, fall back to flat csv'
        )

    parser.add_argument(
        '--output_topic',
        default='/tmp/trading_signals.txt',
        help='Topic of output trading signals in Google Cloud Pub/Sub')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    if known_args.input_mode == 'stream':
        pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:

        # Read input
        input_stage = {}
        for ticker in TICKER_LIST:
            if known_args.input_mode == 'stream':
                input_ticks = (p | beam.io.ReadFromPubSub(
                    topic=known_args.input_topic).with_output_types(
                        six.binary_type))
            else:
                input_ticks = (p | 'Read: %s' % ticker >> ReadFromText(
                    getattr(known_args, 'input_%s' % ticker)))

            input_stage[ticker] = (
                input_ticks
                |
                'decode: %s' % ticker >> beam.Map(lambda x: x.decode('utf-8'))
                | 'Filter: %s' % ticker >>
                beam.Filter(lambda row: row.split(',')[0] != 'date')
                |
                'Add Timestamp: %s' % ticker >> beam.ParDo(AddTimestampDoFn())
                | 'Window: %s' % ticker >> beam.WindowInto(
                    window.SlidingWindows(size=SECONDS_IN_1_DAY * 10,
                                          period=SECONDS_IN_1_DAY))
                |
                'Pair: %s' % ticker >> beam.ParDo(CorrelationPairDoFn(ticker)))

        # Group together all entries under the same ticker
        grouped = input_stage | 'group_by_name' >> beam.CoGroupByKey()

        correlations = (grouped
                        | 'Calculate pair correlation' >>
                        beam.Map(calculate_correlation_pair))

        if known_args.input_mode == 'stream':
            trading_signals = (
                correlations | 'Filter correlation threshold' >> beam.Filter(
                    lambda x: x[1] < CORRELATION_THRESHOLD).with_output_types(
                        six.binary_type))
            # pylint: disable=expression-not-assigned
            trading_signals | beam.io.WriteToPubSub(known_args.output_topic)
        else:
            trading_signals = (
                correlations | 'Filter correlation threshold' >>
                beam.Filter(lambda x: x[1] < CORRELATION_THRESHOLD))
            # pylint: disable=expression-not-assigned
            trading_signals | 'WriteOutput' >> WriteToText(
                known_args.output_topic)
Example 14
            'gamma': 1.2
        }

        model = xgb.train(best_params,
                          dtrain,
                          num_boost_round=1000,
                          evals=watchlist,
                          evals_result=evals_result,
                          verbose_eval=True)

        test.loc[:, "predict"] = model.predict(dtest)

        return test[["shop_id", "date", "predict",
                     "sales"]].to_dict(orient='records')

    (pipeline
     | "Query data" >> beam.Read(beam.io.BigQuerySource(query=query))
     | "Assign time" >> beam.Map(assign_timevalue)
     | "Set window" >> beam.WindowInto(window.SlidingWindows(size=3, period=1))
     | "Set group key" >> beam.Map(lambda v: ('shop_id', v))
     | beam.GroupByKey()
     | "Learn and predict" >> beam.FlatMap(learn_predict)
     | "Write data" >> beam.Write(
         beam.io.BigQuerySink(
             'dataset.table',
             schema="shop_id:STRING, date:STRING, predict:FLOAT, sales:INTEGER",
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))

    pipeline.run()
Example 15
def run(argv=None, save_main_session=True):
    # main function for running the pipeline
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input_topic',
                            dest='input_topic',
                            required=True,
                            help=('Input PubSub topic of the form '
                                  '"projects/<PROJECT>/topics/<TOPIC>".'))
    arg_parser.add_argument(
        '--output_load_table_suffix',
        dest='output_l',
        required=True,
        help=('Output BQ table to write results to (suffix). ' +
              '"[datasetID].[tableID]" ' +
              'Since we have 8 buildings, each building ' +
              'will be loaded on the corresponding table.' +
              'ex) given argument, "energy.building", ' +
              'building 1\'s data will be loaded in energy.building1'))
    arg_parser.add_argument(
        '--output_stream_table',
        dest='output_s',
        required=True,
        help='Output BQ table to write results to. "[datasetID].[tableID]"')
    arg_parser.add_argument(
        '--output_topic',
        dest='output_topic',
        required=True,
        help=('Output PubSub topic of the form ' +
              '"projects/<PROJECT>/topics/<TOPIC>".' +
              'ex) "projects/building-energy-consumption/' +
              'topics/energy_stream"'))
    arg_parser.add_argument(
        '--speedFactor',
        dest='speedFactor',
        required=False,
        default=300,
        type=int,
        help=('How wide do you want your window (in seconds) ' +
              '(Ex) 3600 => 1 hr window'))

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should
    # store temp files, and what the project id is.
    known_args, pipeline_args = arg_parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=options)

    # Require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        arg_parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    # Use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., module imported at module level).
    options.view_as(SetupOptions).save_main_session = save_main_session

    rowToBQ = BQTranslateTransformation()

    # ingest pubsub messages, extract data, and save to lines
    # so it can be used by both batch ingest and stream aggregations
    lines = (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
        topic=known_args.input_topic).with_output_types(bytes)
             |
             'ConvertFromBytesToStr' >> beam.Map(lambda b: b.decode('utf-8')))

    # Convert row of str to BQ rows, and load batch data to table
    # on a daily basis by setting batch_size to rows per day.
    # batch_size is a number of rows to be written to BQ
    # per streaming API insert.
    rows = (lines | 'StringToBigQueryRowLoad' >>
            beam.Map(lambda s: rowToBQ.parse_method_load(s)))

    # load_schema taken from the json file extracted by processCSV.py.
    # In a realistic scenario you may not be able to automate this and
    # will likely have to supply the schema manually.
    load_schema = rowToBQ.schemas

    # filter and load to 8 tables based off of the given table suffix argument
    load1 = (
        rows | 'FilterBuilding1' >>
        beam.Filter(lambda row: int(row['building_id']) == 1)
        |
        'B1BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '1',
                                              schema=load_schema[0],
                                              batch_size=ROWS_PER_DAY))
    load2 = (
        rows | 'FilterBuilding2' >>
        beam.Filter(lambda row: int(row['building_id']) == 2)
        |
        'B2BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '2',
                                              schema=load_schema[1],
                                              batch_size=ROWS_PER_DAY))
    load3 = (
        rows | 'FilterBuilding3' >>
        beam.Filter(lambda row: int(row['building_id']) == 3)
        |
        'B3BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '3',
                                              schema=load_schema[2],
                                              batch_size=ROWS_PER_DAY))
    load4 = (
        rows | 'FilterBuilding4' >>
        beam.Filter(lambda row: int(row['building_id']) == 4)
        |
        'B4BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '4',
                                              schema=load_schema[3],
                                              batch_size=ROWS_PER_DAY))
    load5 = (
        rows | 'FilterBuilding5' >>
        beam.Filter(lambda row: int(row['building_id']) == 5)
        |
        'B5BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '5',
                                              schema=load_schema[4],
                                              batch_size=ROWS_PER_DAY))
    load6 = (
        rows | 'FilterBuilding6' >>
        beam.Filter(lambda row: int(row['building_id']) == 6)
        |
        'B6BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '6',
                                              schema=load_schema[5],
                                              batch_size=ROWS_PER_DAY))
    load7 = (
        rows | 'FilterBuilding7' >>
        beam.Filter(lambda row: int(row['building_id']) == 7)
        |
        'B7BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '7',
                                              schema=load_schema[6],
                                              batch_size=ROWS_PER_DAY))
    load8 = (
        rows | 'FilterBuilding8' >>
        beam.Filter(lambda row: int(row['building_id']) == 8)
        |
        'B8BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '8',
                                              schema=load_schema[7],
                                              batch_size=ROWS_PER_DAY))

    # stream aggregation pipeline; saved to avgs
    # to be used for writing to BigQuery and publishing to Pubsub
    # sliding window of window_size seconds (--speedFactor), starting every window_size/2 seconds
    window_size = known_args.speedFactor
    avgs = (lines | 'SetTimeWindow' >> beam.WindowInto(
        window.SlidingWindows(window_size,
                              float(window_size) / 2))
            | 'ByBuilding' >> beam.ParDo(KVSplitDoFn())
            | 'GetAvgByBuilding' >> Mean.PerKey()
            |
            'AddWindowStartTimestamp' >> beam.ParDo(WindowStartTimestampFn()))

    # Convert row of str to BigQuery rows, and append to the BQ table.
    (avgs | 'StrToBigQueryRowStream' >>
     beam.Map(lambda s: rowToBQ.parse_method_stream(s))
     | 'WriteToBigQueryStream' >> beam.io.WriteToBigQuery(
         table=known_args.output_s,
         schema=rowToBQ.stream_schema,
         project=options.view_as(GoogleCloudOptions).project))

    # write message to pubsub with a different output_topic
    # for users to subscribe to and retrieve real time analysis data
    (avgs |
     'Encode' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
     |
     'PublishToPubSub' >> beam.io.WriteToPubSub('projects/{}/topics/{}'.format(
         options.view_as(GoogleCloudOptions).project, known_args.output_topic))
     )
    # nothing will run until this command
    p.run()
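The eight FilterBuildingN/BNBQLoad branches above differ only in the building id; a sketch of how they could be generated in a loop inside run(), using the same names from the snippet (note the default argument that binds the id at lambda definition time):

    loads = []
    for i in range(1, 9):
        loads.append(
            rows
            | 'FilterBuilding%d' % i >> beam.Filter(
                lambda row, b=i: int(row['building_id']) == b)
            | 'B%dBQLoad' % i >> beam.io.WriteToBigQuery(
                table=known_args.output_l + str(i),
                schema=load_schema[i - 1],
                batch_size=ROWS_PER_DAY))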
Example 16
def run(argv=None, save_main_session=True):
    '''Build and run the pipeline.'''
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--input_topic',
                            dest='input_topic',
                            required=True,
                            help=('Input PubSub topic of the form '
                                  '"projects/<PROJECT>/topics/<TOPIC>".'))
    arg_parser.add_argument(
        '--output_load_table_suffix',
        dest='output_l',
        required=True,
        help=
        ('Output BQ table to write results to (suffix). "[datasetID].[tableID]".'
         + 'Since we have 8 buildings, each building ' +
         'will be loaded on the corresponding table. ex) given argument, "energy.building" '
         + 'building 1\'s data will be loaded in energy.building1 '))
    arg_parser.add_argument(
        '--output_stream_table',
        dest='output_s',
        required=True,
        help='Output BQ table to write results to. "[datasetID].[tableID]"')
    arg_parser.add_argument(
        '--output_topic',
        dest='output_topic',
        required=True,
        help=('Output PubSub topic of the form ' +
              '"projects/<PROJECT>/topics/<TOPIC>".' +
              'ex) "projects/building-energy-consumption/' +
              'topics/energy_stream"'))
    arg_parser.add_argument(
        '--speedFactor',
        dest='speedFactor',
        required=False,
        default=300,
        type=int,
        help=('How wide do you want your window (in seconds) ' +
              '(Ex) 3600 => 1 hr window'))

    known_args, pipeline_args = arg_parser.parse_known_args(argv)
    #logging.info('parsed args: {}'.format(known_args))
    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information like where Dataflow should
    # store temp files, and what the project id is.
    options = PipelineOptions(pipeline_args)
    p = beam.Pipeline(options=options)
    # schema = parse_table_schema_from_json(data_ingestion.schema_str)

    # We also require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        arg_parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = save_main_session

    rowToBQ = BQTranslateTransformation()

    # with open(SCHEMA_PATH) as bq_schema_file:
    #     load_schema = json.load(load_schema_file)
    #     stream_schema = json.load(load_schema_file)
    ''' 
    if new columns need to be added, add by
    [SCHEMATYPE]_schema['fields'].append({
        'name': [FIELDNAME],
        'type': [FIELDTYPE],
        'mode': [FIELDMODE],
    })
    '''

    # ingest pubsub messages, extract data, and save to lines
    # so it can be used by both batch ingest and stream aggregations
    lines = (p
             | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
                 topic=known_args.input_topic).with_output_types(bytes)
             |
             'ConvertFromBytesToStr' >> beam.Map(lambda b: b.decode('utf-8')))

    # split into streaming inserts and batch loads,
    # because loads are free while streaming inserts are billed by data size

    # Convert row of str to BQ rows, and load batch data to table on a daily basis
    # Set batch_size to rows per day to load sensor data in BQ on a daily basis
    # batch_size is a number of rows to be written to BQ per streaming API insert.
    rows = (lines | 'StringToBigQueryRowLoad' >>
            beam.Map(lambda s: rowToBQ.parse_method_load(s)))

    # load_schema taken from the json file extracted by processCSV.py.
    # In a realistic scenario you may not be able to automate this and
    # will likely have to supply the schema manually.
    load_schema = rowToBQ.schemas

    # filter and load into 8 tables based off of the given table suffix argument
    load1 = (
        rows | 'FilterBuilding1' >>
        beam.Filter(lambda row: int(row['building_id']) == 1)
        |
        'B1BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '1',
                                              schema=load_schema[0],
                                              batch_size=ROWS_PER_DAY))
    load2 = (
        rows | 'FilterBuilding2' >>
        beam.Filter(lambda row: int(row['building_id']) == 2)
        |
        'B2BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '2',
                                              schema=load_schema[1],
                                              batch_size=ROWS_PER_DAY))
    load3 = (
        rows | 'FilterBuilding3' >>
        beam.Filter(lambda row: int(row['building_id']) == 3)
        |
        'B3BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '3',
                                              schema=load_schema[2],
                                              batch_size=ROWS_PER_DAY))
    load4 = (
        rows | 'FilterBuilding4' >>
        beam.Filter(lambda row: int(row['building_id']) == 4)
        |
        'B4BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '4',
                                              schema=load_schema[3],
                                              batch_size=ROWS_PER_DAY))
    load5 = (
        rows | 'FilterBuilding5' >>
        beam.Filter(lambda row: int(row['building_id']) == 5)
        |
        'B5BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '5',
                                              schema=load_schema[4],
                                              batch_size=ROWS_PER_DAY))
    load6 = (
        rows | 'FilterBuilding6' >>
        beam.Filter(lambda row: int(row['building_id']) == 6)
        |
        'B6BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '6',
                                              schema=load_schema[5],
                                              batch_size=ROWS_PER_DAY))
    load7 = (
        rows | 'FilterBuilding7' >>
        beam.Filter(lambda row: int(row['building_id']) == 7)
        |
        'B7BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '7',
                                              schema=load_schema[6],
                                              batch_size=ROWS_PER_DAY))
    load8 = (
        rows | 'FilterBuilding8' >>
        beam.Filter(lambda row: int(row['building_id']) == 8)
        |
        'B8BQLoad' >> beam.io.WriteToBigQuery(table=known_args.output_l + '8',
                                              schema=load_schema[7],
                                              batch_size=ROWS_PER_DAY))

    # stream aggregation pipeline; saved to avgs
    # to be used for writing to BigQuery and publishing to Pubsub
    # sliding window of window_size seconds (--speedFactor), starting every window_size/2 seconds
    window_size = known_args.speedFactor
    avgs = (
        lines
        #  | 'AddEventTimestamps' >> beam.Map(lambda s: window.TimestampedValue(s,
        #                             time.mktime(dateutil.parser.parse(s.split(',')[0]).timetuple())))
        #  | 'AddEventTimestamps' >>  beam.ParDo(AddTimestampDoFn())
        # | 'SetTimeWindow' >> beam.WindowInto(window.SlidingWindows(WINDOW_SIZE, WINDOW_PERIOD, offset=0))
        # sliding window of [window_size] seconds, starting every [window_size/2] seconds
        | 'SetTimeWindow' >> beam.WindowInto(
            window.SlidingWindows(window_size,
                                  float(window_size) / 2))
        # splitting to k,v of buildingId (2nd column), general meter reading (3rd column)
        # TODO: GroupByKey currently produces no output; or is the window so wide that results simply take a long time?
        #  | 'ByBuilding' >> beam.Map(lambda s: (s.split(',')[1], int(float(s.split(',')[2]))))
        | 'ByBuilding' >> beam.ParDo(KVSplitDoFn())
        | 'GetAvgByBuilding' >> Mean.PerKey()
        #  | 'CountByBuilding' >> Count.PerKey())
        | 'AddWindowStartTimestamp' >> beam.ParDo(WindowStartTimestampFn()))

    # Convert row of str to BigQuery rows, and append to the BQ table.
    (avgs | 'StrToBigQueryRowStream' >>
     beam.Map(lambda s: rowToBQ.parse_method_stream(s))
     | 'WriteToBigQueryStream' >> beam.io.WriteToBigQuery(
         table=known_args.output_s,
         schema=rowToBQ.stream_schema,
         project=options.view_as(GoogleCloudOptions).project))

    # write message to pubsub with a different output_topic
    # for users to subscribe to and retrieve real time analysis data
    (avgs |
     'Encode' >> beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes)
     |
     'PublishToPubSub' >> beam.io.WriteToPubSub('projects/{}/topics/{}'.format(
         options.view_as(GoogleCloudOptions).project, known_args.output_topic))
     )

    p.run()
Example 17
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        lines = p | beam.io.ReadStringsFromPubSub(
            subscription=known_args.input_subscription)
    else:
        lines = p | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (
        lines
        #   | 'print1' >> beam.Map(print)
        | 'split' >> beam.Map(Extracting_X_Value)
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.SlidingWindows(10, 1, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Branch 1: Alert when x hits -1.0 ten times within a window, by writing a message to PubSub
    alert = (counts | 'filter' >> beam.ParDo(Alerting_X_Value())
             | 'write_to_pubsub' >> beam.io.WriteStringsToPubSub(
                 known_args.output_topic))

    # Branch 2: Print out the output
    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '[{}] {}: {}'.format(time.ctime(), word, count)

    output = (counts | 'format' >> beam.Map(format_result)
              | 'print' >> beam.Map(print))

    result = p.run()
    result.wait_until_finish()
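The GroupByKey plus count_ones pair in this example is the classic windowed word count; the same result can be expressed more compactly with a combiner, e.g. (an equivalent sketch, not a change to the original pipeline):

    counts = (
        lines
        | 'split' >> beam.Map(Extracting_X_Value)
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.SlidingWindows(10, 1, 0))
        # CombinePerKey(sum) fuses the GroupByKey and the summing Map.
        | 'count' >> beam.CombinePerKey(sum))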