Example #1
def load(events, metadata=None, pipeline_options=None):
  return (
      events
      | 'query10_shard_events' >> beam.ParDo(ShardEventsDoFn())
      # The trigger fires when each sub-trigger (executed in order) fires:
      # repeatedly 1. after at least maxLogEvents elements are in the pane
      #            2. or finally when the watermark passes the end of the window
      # repeatedly 1. after at least maxLogEvents elements are in the pane
      #            2. or when processing time passes the first element in the
      #               pane plus the late-batching delay
      | 'query10_fix_window' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          trigger=trigger.AfterEach(
              trigger.OrFinally(
                  trigger.Repeatedly(
                      trigger.AfterCount(metadata.get('max_log_events'))),
                  trigger.AfterWatermark()),
              trigger.Repeatedly(
                  trigger.AfterAny(
                      trigger.AfterCount(metadata.get('max_log_events')),
                      trigger.AfterProcessingTime(LATE_BATCHING_PERIOD)))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          # Use a 1 day allowed lateness so that any forgotten hold will stall
          # the pipeline for that period and be very noticeable.
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk' >> beam.GroupByKey()
      | 'query10_write_event' >> beam.ParDo(WriteEventDoFn(), pipeline_options)
      | 'query10_window_log_files' >> beam.WindowInto(
          window.FixedWindows(metadata.get('window_size_sec')),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=Duration.of(1 * 24 * 60 * 60))
      | 'query10_gbk_2' >> beam.GroupByKey()
      | 'query10_write_index' >> beam.ParDo(WriteIndexDoFn(), pipeline_options))
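The composite trigger in Example #1 is easier to read in isolation. Below is a minimal sketch of just that trigger, using hypothetical constants in place of metadata.get('max_log_events') and LATE_BATCHING_PERIOD: AfterEach runs its two sub-triggers in order, so the window first fires per batch of elements until the watermark passes the end of the window, then falls back to firing on an element count or a processing-time delay, whichever comes first.

from apache_beam.transforms import trigger

MAX_LOG_EVENTS = 100000      # hypothetical; stands in for metadata.get('max_log_events')
LATE_BATCHING_PERIOD = 10    # hypothetical processing-time delay, in seconds

composite_trigger = trigger.AfterEach(
    # Stage 1: fire every MAX_LOG_EVENTS elements, until the watermark
    # passes the end of the window.
    trigger.OrFinally(
        trigger.Repeatedly(trigger.AfterCount(MAX_LOG_EVENTS)),
        trigger.AfterWatermark()),
    # Stage 2: afterwards, fire on whichever comes first of an element count
    # or a processing-time delay measured from the first element in the pane.
    trigger.Repeatedly(
        trigger.AfterAny(
            trigger.AfterCount(MAX_LOG_EVENTS),
            trigger.AfterProcessingTime(LATE_BATCHING_PERIOD))))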
Example #2
def load(events, metadata=None, pipeline_options=None):
  num_events_in_pane = 30
  windowed_events = (
      events
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(num_events_in_pane)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING))
  auction_by_seller_id = (
      windowed_events
      | nexmark_query_util.JustAuctions()
      | 'query3_filter_category' >> beam.Filter(lambda auc: auc.category == 10)
      | 'query3_key_by_seller' >> beam.ParDo(
          nexmark_query_util.AuctionBySellerFn()))
  person_by_id = (
      windowed_events
      | nexmark_query_util.JustPerson()
      | 'query3_filter_region' >>
      beam.Filter(lambda person: person.state in ['OR', 'ID', 'CA'])
      | 'query3_key_by_person_id' >> beam.ParDo(
          nexmark_query_util.PersonByIdFn()))
  return ({
      nexmark_query_util.AUCTION_TAG: auction_by_seller_id,
      nexmark_query_util.PERSON_TAG: person_by_id,
  }
          | beam.CoGroupByKey()
          | 'query3_join' >> beam.ParDo(
              JoinFn(metadata.get('max_auction_waiting_time')))
          | 'query3_output' >> beam.Map(
              lambda t: {
                  ResultNames.NAME: t[1].name,
                  ResultNames.CITY: t[1].city,
                  ResultNames.STATE: t[1].state,
                  ResultNames.AUCTION_ID: t[0].id
              }))
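Example #2 opens with the same data-driven trigger that recurs throughout these examples: Repeatedly(AfterCount(n)) on the global window emits a pane roughly every n elements per key. The sketch below is a minimal, runnable illustration on the streaming DirectRunner; the TestStream input and element values are hypothetical and exist only to make the snippet self-contained.

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_stream import TestStream
from apache_beam.transforms import trigger, window

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True

with beam.Pipeline(options=options) as p:
    _ = (
        p
        | TestStream().add_elements(list(range(6))).advance_watermark_to_infinity()
        | beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(3)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | beam.Map(lambda n: ('key', n))
        | beam.GroupByKey()   # each trigger firing produces one pane per key
        | beam.Map(print))    # expect panes of roughly three elements each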
Example #3
    def _window_fn(self):
        """Set the correct WindowInto PTransform"""

        # The user-supplied triggering_frequency is often chosen to control how
        # many BigQuery load jobs are triggered, to prevent going over BigQuery's
        # daily quota for load jobs. If this is set to a large value, currently we
        # have to buffer all the data until the trigger fires. Instead we ensure
        # that the files are written if a threshold number of records are ready.
        # We use only the user-supplied trigger on the actual BigQuery load.
        # This allows us to offload the data to the filesystem.
        #
        # In the case of dynamic sharding, however, we use the default trigger,
        # since the transform that performs the sharding also batches elements to
        # avoid generating too many tiny files. The user trigger is applied right
        # after the writes to limit the number of load jobs.
        if self.is_streaming_pipeline and not self.with_auto_sharding:
            return beam.WindowInto(beam.window.GlobalWindows(),
                                   trigger=trigger.Repeatedly(
                                       trigger.AfterAny(
                                           trigger.AfterProcessingTime(
                                               self.triggering_frequency),
                                           trigger.AfterCount(
                                               _FILE_TRIGGERING_RECORD_COUNT))),
                                   accumulation_mode=trigger.AccumulationMode\
                                       .DISCARDING)
        else:
            return beam.WindowInto(beam.window.GlobalWindows())
Example #4
 def expand(self, pcoll):
     return (pcoll
             | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
             | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
Example #5
 def expand(self, pcoll):
     return (pcoll
             | 'TweetGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(50)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                 allowed_lateness=self.allowed_lateness_seconds)
             # Extract tweets keyed by user id from the event data.
             | 'ExtractTweets' >> ExtractTweets('user_id'))
Example #6
 def expand(self, pcoll):
     return (pcoll
             # Get periodic results every ten events.
             | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                 allowed_lateness=self.allowed_lateness_seconds)
             # Extract and sum username/score pairs from the event data.
             | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
Example #7
  def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
    outputs = (
        destination_data_kv_pc
        | beam.ParDo(
            WriteRecordsToFile(
                schema=self.schema,
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                file_format=self._temp_file_format),
            file_prefix_pcv,
            *self.schema_side_inputs).with_outputs(
                WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                WriteRecordsToFile.WRITTEN_FILE_TAG))

    # A PCollection of (destination, file) tuples. It lists files with records,
    # and the destination each file is meant to be imported into.
    destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

    # A PCollection of (destination, record) tuples. These are later sharded and
    # grouped, and all records for each destination-shard are written to files.
    # This PCollection is necessary because not all records can be written into
    # files in ``WriteRecordsToFile``.
    unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

    more_destination_files_kv_pc = (
        unwritten_records_pc
        | beam.ParDo(_ShardDestinations())
        | "GroupShardedRows" >> beam.GroupByKey()
        | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
        | "WriteGroupedRecordsToFile" >> beam.ParDo(
            WriteGroupedRecordsToFile(
                schema=self.schema, file_format=self._temp_file_format),
            file_prefix_pcv,
            *self.schema_side_inputs))

    # TODO(BEAM-9494): Remove the identity transform. We flatten both
    # PCollection paths and use an identity function to work around a
    # flatten optimization issue where the wrong coder is being used.
    all_destination_file_pairs_pc = (
        (destination_files_kv_pc, more_destination_files_kv_pc)
        | "DestinationFilesUnion" >> beam.Flatten()
        | "IdentityWorkaround" >> beam.Map(lambda x: x))

    if self.is_streaming_pipeline:
      # Apply the user's trigger back before we start triggering load jobs
      all_destination_file_pairs_pc = (
          all_destination_file_pairs_pc
          | "ApplyUserTrigger" >> beam.WindowInto(
              beam.window.GlobalWindows(),
              trigger=trigger.Repeatedly(
                  trigger.AfterAll(
                      trigger.AfterProcessingTime(self.triggering_frequency),
                      trigger.AfterCount(1))),
              accumulation_mode=trigger.AccumulationMode.DISCARDING))
    return all_destination_file_pairs_pc
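The with_outputs(...) call in Example #7 is what splits a single ParDo into the written-file and unwritten-record streams that the rest of the method consumes. A minimal sketch of that tagged-output pattern, with a hypothetical DoFn and tags, looks like this.

import apache_beam as beam

class SplitEvenOdd(beam.DoFn):
    EVEN_TAG = 'even'
    ODD_TAG = 'odd'

    def process(self, n):
        # Route each element to one of the declared output tags.
        tag = self.EVEN_TAG if n % 2 == 0 else self.ODD_TAG
        yield beam.pvalue.TaggedOutput(tag, n)

with beam.Pipeline() as p:
    outputs = (
        p
        | beam.Create(range(10))
        | beam.ParDo(SplitEvenOdd()).with_outputs(
            SplitEvenOdd.EVEN_TAG, SplitEvenOdd.ODD_TAG))
    # Index the result by tag, as Example #7 does with WRITTEN_FILE_TAG and
    # UNWRITTEN_RECORD_TAG.
    evens = outputs[SplitEvenOdd.EVEN_TAG]
    odds = outputs[SplitEvenOdd.ODD_TAG]
    _ = evens | 'PrintEvens' >> beam.Map(print)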
Example #8
    def expand(self, pcoll):
        logging.info("Calculate user values: {}".format(pcoll))

        return (pcoll
                # Get periodic results every ten events.
                | 'HighValueUserGlobalWindows' >> beam.WindowInto(
                    beam.window.GlobalWindows(),
                    trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
                # Extract and sum username/value pairs from the event data.
                | 'ExtractAndSumValue' >> ExtractAndSumValue('user'))
Example #9
 def expand(self, pcoll):
     # NOTE: the behavior does not exactly match the Java example
     # TODO: allowed_lateness not implemented yet in FixedWindows
     # TODO: AfterProcessingTime not implemented yet, replace AfterCount
     return (pcoll
             # Get periodic results every ten events.
             | 'LeaderboardUserGlobalWindows' >> beam.WindowInto(
                 beam.window.GlobalWindows(),
                 trigger=trigger.Repeatedly(trigger.AfterCount(10)),
                 accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
             # Extract and sum username/score pairs from the event data.
             | 'ExtractAndSumScore' >> ExtractAndSumScore('user'))
Example #10
 def _maybe_apply_user_trigger(self, destination_file_kv_pc):
     if self.is_streaming_pipeline:
         # Apply the user's trigger back before we start triggering load jobs
         return (destination_file_kv_pc
                 | "ApplyUserTrigger" >> beam.WindowInto(
                     beam.window.GlobalWindows(),
                     trigger=trigger.Repeatedly(
                         trigger.AfterAll(
                             trigger.AfterProcessingTime(
                                 self.triggering_frequency),
                             trigger.AfterCount(1))),
                     accumulation_mode=trigger.AccumulationMode.DISCARDING))
     else:
         return destination_file_kv_pc
Example #11
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # windowing with processing time trigger, currently not supported in batch
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))
Example #12
    def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
        outputs = (destination_data_kv_pc
                   | beam.ParDo(WriteRecordsToFile(
                       max_files_per_bundle=self.max_files_per_bundle,
                       max_file_size=self.max_file_size,
                       coder=self.coder),
                                file_prefix=file_prefix_pcv).with_outputs(
                                    WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                                    WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded
        # and grouped, and all records for each destination-shard are written to
        # files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        if self.is_streaming_pipeline:
            # Apply the user's trigger back before we start triggering load jobs
            all_destination_file_pairs_pc = (
                all_destination_file_pairs_pc
                | "ApplyUserTrigger" >> beam.WindowInto(
                    beam.window.GlobalWindows(),
                    trigger=trigger.Repeatedly(
                        trigger.AfterAll(
                            trigger.AfterProcessingTime(
                                self.triggering_frequency),
                            trigger.AfterCount(1))),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING))
        return all_destination_file_pairs_pc
Example #13
def load(events, metadata=None):
    # find winning bids for each closed auction
    return (events
            # find winning bids
            | beam.Filter(nexmark_query_util.auction_or_bid)
            | winning_bids.WinningBids()
            # auction_bids -> (auction.seller, bid)
            | beam.Map(lambda auc_bid: (auc_bid.auction.seller, auc_bid.bid))
            # calculate and output mean as data arrives
            | beam.WindowInto(
                window.GlobalWindows(),
                trigger=trigger.Repeatedly(trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                allowed_lateness=0)
            | beam.CombinePerKey(MovingMeanSellingPriceFn(10))
            | beam.Map(lambda t: {
                ResultNames.SELLER: t[0],
                ResultNames.PRICE: t[1]
            }))
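MovingMeanSellingPriceFn is not shown in Example #13. The sketch below is a hypothetical, simplified CombineFn in the same spirit: it keeps only the most recent max_count prices per seller (ignoring event timestamps) and reports their mean, so that CombinePerKey(MovingMeanFn(10)) could stand in for the call above.

import apache_beam as beam

class MovingMeanFn(beam.CombineFn):
    """Hypothetical moving mean over the last `max_count` values per key."""

    def __init__(self, max_count):
        self.max_count = max_count

    def create_accumulator(self):
        return []                             # most recent prices

    def add_input(self, accumulator, price):
        accumulator.append(price)
        return accumulator[-self.max_count:]  # keep only the newest values

    def merge_accumulators(self, accumulators):
        merged = [p for acc in accumulators for p in acc]
        return merged[-self.max_count:]

    def extract_output(self, accumulator):
        return sum(accumulator) / len(accumulator) if accumulator else 0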
Example #14
 def test(self):
   _ = (
       self.pipeline
       | 'Read from pubsub' >> ReadFromPubSub(
           subscription=self.read_sub_name,
           with_attributes=True,
           id_label='id',
       )
       | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
       | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
       | 'Window' >> beam.WindowInto(
           window.GlobalWindows(),
           trigger=trigger.Repeatedly(
               trigger.AfterCount(self.num_of_messages)),
           accumulation_mode=trigger.AccumulationMode.DISCARDING)
       | 'Count messages' >> beam.CombineGlobally(
           beam.combiners.CountCombineFn()).without_defaults().
       with_output_types(int)
       | 'Convert to bytes' >>
       beam.Map(lambda count: str(count).encode('utf-8'))
       | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
Example #15
    def _window_fn(self):
        """Set the correct WindowInto PTransform"""

        # The user-supplied triggering_frequency is often chosen to control how
        # many BigQuery load jobs are triggered, to prevent going over BigQuery's
        # daily quota for load jobs. If this is set to a large value, currently we
        # have to buffer all the data until the trigger fires. Instead we ensure
        # that the files are written if a threshold number of records are ready.
        # We use only the user-supplied trigger on the actual BigQuery load.
        # This allows us to offload the data to the filesystem.
        if self.is_streaming_pipeline:
            return beam.WindowInto(beam.window.GlobalWindows(),
                                   trigger=trigger.Repeatedly(
                                       trigger.AfterAny(
                                           trigger.AfterProcessingTime(
                                               self.triggering_frequency),
                                           trigger.AfterCount(
                                               _FILE_TRIGGERING_RECORD_COUNT))),
                                   accumulation_mode=trigger.AccumulationMode\
                                       .DISCARDING)
        else:
            return beam.WindowInto(beam.window.GlobalWindows())
Example #16
def run():
    options = PipelineOptions([
        "--runner=PortableRunner", "--job_endpoint=localhost:8099",
        "--environment_type=LOOPBACK"
    ])
    # options = PipelineOptions([
    #     "--runner=FlinkRunner",
    #     "--flink_master=localhost:8081",
    # ])
    with beam.Pipeline(options=options) as p:
        (p | 'ReadFromKafka' >> ReadFromKafka(
            consumer_config={"bootstrap.servers": "localhost:9092"},
            topics=["beam-input"])
         | 'ExtractWords' >>
         beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
         | 'Window' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.Repeatedly(trigger.AfterCount(1)),
             accumulation_mode=AccumulationMode.ACCUMULATING)
         | 'Count' >> beam.combiners.Count.PerElement()
         | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                                (word_count[0], word_count[1]))
         | 'Log' >> beam.ParDo(LoggingDoFn()))
Example #17
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p | 'ReadFromKafka' >> ReadFromKafka(
        consumer_config={"bootstrap.servers": "localhost:9092"},
        topics=["beam-input"])
     | 'ExtractWords' >>
     beam.FlatMap(lambda kv: re.findall(r'[A-Za-z\']+', kv[1]))
     | 'Window' >> beam.WindowInto(
         window.GlobalWindows(),
         trigger=trigger.Repeatedly(trigger.AfterCount(1)),
         accumulation_mode=AccumulationMode.ACCUMULATING)
     | 'Count' >> beam.combiners.Count.PerElement()
     | 'Format' >> beam.Map(lambda word_count: '%s: %s' %
                            (word_count[0], word_count[1]))
     | 'Log' >> beam.ParDo(LoggingDoFn()))

    result = p.run()
    result.wait_until_finish()
Example #18
def main(argv=None):
    def json_parser(x):
        parsed = json.loads(x)
        return parsed

    def bye(x):
        logging.info('outing: %s', x)
        return x

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_topic")
    parser.add_argument("--output_topic")
    known_args = parser.parse_known_args(argv)

    p = beam.Pipeline(options=PipelineOptions())

    data = (p
            | 'ReadData' >>
            beam.io.ReadFromPubSub(topic=READ_TOPIC).with_output_types(bytes)
            | "JSONParse" >> beam.Map(json_parser))

    (data
     | "AddingKeyToSumUp" >> beam.WithKeys(lambda x: x["ride_id"])
     | "Windowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(2)))),
         accumulation_mode=tr.AccumulationMode.DISCARDING,
         allowed_lateness=0)
     | 'ToBytes' >> beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye' >> beam.Map(bye)
     | 'WriteToPubSub' >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "SlidWindowing" >> beam.WindowInto(
         window.FixedWindows(60),
         trigger=(tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1))),
                                    late=tr.Repeatedly(tr.AfterCount(1)))),
         allowed_lateness=300,
         accumulation_mode=tr.AccumulationMode.ACCUMULATING)
     | "Extract" >> beam.Map(lambda x: x["meter_increment"])
     | "Sum_up" >> beam.CombineGlobally(sum).without_defaults()
     | "Reformat" >> beam.Map(lambda x: {"dollar_run_rate_per_minute": x})
     | "Enrich with time data" >> beam.ParDo(Enrich())
     | "ToBytesCount" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye2' >> beam.Map(bye)
     | "WriteCount" >> beam.io.WriteToPubSub(TOPIC))

    (data
     | "AddingKey" >> beam.WithKeys(lambda x: x["ride_id"])
     | "SessionWindowing" >> beam.WindowInto(
         window.Sessions(60),
         trigger=tr.AfterWatermark(early=tr.Repeatedly(
             tr.AfterAll(tr.AfterCount(1), tr.AfterProcessingTime(1)))),
         accumulation_mode=tr.AccumulationMode.ACCUMULATING,
         allowed_lateness=0)
     | "GroupInPickup" >> beam.CombinePerKey(PickupFn())
     | "Discarding Key" >> beam.Map(lambda x: x[1])
     | "Filter not pickup" >>
     beam.Map(lambda x: x if str(x["ride_status"]) == "pickup" else None)
     | "ToBytesPickup" >>
     beam.Map(lambda x: json.dumps(x, indent=2).encode('utf-8'))
     | 'Bye3' >> beam.Map(bye)
     | "WritePickup" >> beam.io.WriteToPubSub(TOPIC))

    result = p.run()
    result.wait_until_finish()
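The 'SlidWindowing' branch above layers early, on-time, and late firings on a fixed window. Isolated with hypothetical sizes, the pattern reads as follows: the on-time pane fires when the watermark passes the end of each 60-second window, the early trigger produces speculative panes while the window is still open, and every late element within the 300 seconds of allowed lateness yields one more pane.

import apache_beam as beam
from apache_beam.transforms import trigger, window

early_on_time_late = beam.WindowInto(
    window.FixedWindows(60),                  # hypothetical window size
    trigger=trigger.AfterWatermark(
        # Early (speculative) panes while the window is still open.
        early=trigger.Repeatedly(
            trigger.AfterAll(trigger.AfterCount(1),
                             trigger.AfterProcessingTime(1))),
        # One extra pane per late element, up to allowed_lateness.
        late=trigger.Repeatedly(trigger.AfterCount(1))),
    allowed_lateness=300,                     # hypothetical, in seconds
    accumulation_mode=trigger.AccumulationMode.ACCUMULATING)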
Example #19
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if (not known_args.topic or not known_args.play_topic):
        logging.fatal('topic and play_topic are required.')

    events = (p
              | 'read_events' >> ReadFromPubSub(
                  topic=known_args.topic, timestamp_attribute='timestamp_ms')
              | 'parse_events' >> beam.ParDo(ParseEventFn()))

    play_events = (
        p
        | 'read_play_events' >> ReadFromPubSub(
            topic=known_args.play_topic, timestamp_attribute='timestamp_ms')
        | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))

    sessionized_events = (
        events
        | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    sessionized_plays = (
        play_events
        | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_plays' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    per_user_latency = ({
        'plays': sessionized_plays,
        'events': sessionized_events
    }
                        | 'cbk' >> beam.CoGroupByKey()
                        | 'compute_latency' >> beam.ParDo(ComputeLatency()))

    mean_latency = (
        per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(beam.combiners.MeanCombineFn(
        )).with_fanout(16).as_singleton_view())

    _ = (per_user_latency
         | 'detect_bad_users' >> beam.ParDo(DetectBadUsers(),
                                            mean_latency=mean_latency)
         | 'filter_duplicates' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.AfterCount(1),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
         | 'write_bad_users' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             ('user:string')))

    p.run().wait_until_finish()
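mean_latency in Example #19 is a singleton side input: CombineGlobally(...).as_singleton_view() turns the combined value into a view that the downstream DetectBadUsers ParDo receives as a keyword argument. A minimal sketch of the same pattern, with hypothetical names and data:

import apache_beam as beam

class FlagAboveMean(beam.DoFn):
    def process(self, value, mean):
        # `mean` is the single combined value delivered via the side input.
        if value > mean:
            yield value

with beam.Pipeline() as p:
    latencies = p | beam.Create([3.0, 8.0, 21.0, 5.0])
    mean_view = (
        latencies
        | beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).as_singleton_view())
    _ = (latencies
         | beam.ParDo(FlagAboveMean(), mean=mean_view)
         | beam.Map(print))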
Example #20
def run(argv=None):
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument(
                '--input',
                default='projects/notbanana-7f869/topics/rsvps_source')
            parser.add_argument(
                '--output',
                default='projects/notbanana-7f869/topics/rsvps_out')

    options = PipelineOptions(flags=argv)

    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'notbanana-7f869'
    google_cloud_options.staging_location = 'gs://notbanana-7f869.appspot.com/staging'
    google_cloud_options.temp_location = 'gs://notbanana-7f869.appspot.com/temp'
    google_cloud_options.job_name = 'demo-job'
    """
    -> Run the pipeline on the Cloud Dataflow runner.
    $ python pipelines/main.py --setup_file path/to/setup.py
    """
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        my_options = options.view_as(MyOptions)
        input_topic = my_options.input
        output_topic = my_options.output
        """
        -> Consumes/collects events sent by the input Pub/Sub topic.
        @: id_label argument is a unique identifier used by the pipeline to
        deduplicate events : Exactly-once semantic.
        """
        inputs = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                            topic=input_topic,
                            # id_label='event_id'
                    ).with_output_types(six.binary_type)
             | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
             | 'Transform Json To Dict' >> beam.Map(lambda element: json.loads(element))
             | 'Filter noVenue' >> beam.ParDo(FilterNoVenueEventsFn()))
        """ 
        -> Outputs the total number of events globally processed by the pipeline.
        Triggering early results from the window every X seconds (processing time trigger)
        or triggering when the current pane has collected at least N elements (data-driven trigger)
        Values used are for testing purposes.
        """
        (inputs
         | 'Apply Global Window' >> beam.WindowInto(
             beam.window.GlobalWindows(),
             trigger=trigger.Repeatedly(
                 trigger.AfterAny(
                     trigger.AfterCount(2),
                     # AfterProcessingTime is experimental.
                     # Not implemented yet.
                     trigger.AfterProcessingTime(30))),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Count events globally' >> beam.CombineGlobally(
             beam.combiners.CountCombineFn()).without_defaults()
         | 'Publish %s' % 'Events' >> WriteToPubSub(
             topic=output_topic, category=Category.GLOBAL_EVENTS))
        """
        -> Outputs the top 10 hottest topics within a Fixed Window of X seconds. 
        Values used are for testing purposes.
        NB: Using a custom TopFn that will deduplicate k/v pairs
        when using an accumulation strategy: SO - 56616576 @guillem-xercavins
        """
        (inputs
         | 'Apply Window of time %s' % 'Topics' >> beam.WindowInto(
             beam.window.FixedWindows(size=10 * 60),
             trigger=trigger.Repeatedly(trigger.AfterCount(5)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | beam.Map(lambda element: element['group'])
         | beam.ParDo(PairTopicWithOneFn())
         | beam.CombinePerKey(sum)
         | 'Top 10 Topics' >> beam.CombineGlobally(
             TopDistinctFn(
                 n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
         | 'DictFormat %s' % 'Topics' >> beam.ParDo(FormatTopTopicFn())
         | 'Publish %s' % 'Topics' >> WriteToPubSub(
             topic=output_topic, category=Category.HOT_TOPICS))
Example #21
def run(argv=None):
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input', default=TW_INPUT)
            parser.add_argument('--output', default=TW_OUTPUT)

    options = PipelineOptions(flags=argv)

    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.staging_location = STAGING_LOCATION
    google_cloud_options.temp_location = TEMP_LOCATION
    google_cloud_options.flexrs_goal = 'COST_OPTIMIZED'
    # google_cloud_options.job_name = 'hashtags-battle-job'
    """
    -> Uncomment this to run the pipeline on the Cloud Dataflow runner.
    $ python main.py --setup_file ./setup.py --machine_type=n1-standard-2 --max_num_workers=2 --disk_size_gb=30
    """
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        my_options = options.view_as(MyOptions)
        input_topic = my_options.input
        output_topic = my_options.output
        """
        -> Consumes/collects events sent by the input Pub/Sub topic.
        @: id_label argument is a unique identifier used by the pipeline to
        deduplicate events : Exactly-once semantic.
        """
        inputs = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                            topic=input_topic,
                            # id_label='event_id'
                    ).with_output_types(six.binary_type)
             | 'Decode Binary' >> beam.Map(lambda element: element.decode('utf-8'))
             | 'Transform Json To Dict' >> beam.Map(lambda element: json.loads(element)))
        # | 'Add Event Time' >> beam.ParDo(AddTimestampFn())
        """
        -> Extracts hashtags array from object.
        """
        hashtags = \
            (inputs
             | 'Get Hashtags' >> beam.Map(lambda element: element['hashtags'])
             | 'Explode Hashtags' >> beam.FlatMap(lambda element: element))
        """
        -> Outputs a batch of pre-aggregated hashtags.
        Triggering early results from the window every X seconds (processing time trigger)
        or triggering when the current pane has collected at least N elements (data-driven trigger)
        Values used are for testing purposes.
        """
        (hashtags
         | 'Apply Daily Window' >> beam.WindowInto(
             beam.window.FixedWindows(SECONDS_IN_1_DAY),
             trigger=trigger.Repeatedly(trigger.AfterCount(10)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Grouping Hashtags' >> PairWithOneCombine()
         | 'Format Hashtags' >> beam.ParDo(FormatHashtagFn())
         | 'Batch Hashtags' >> beam.BatchElements(min_batch_size=49,
                                                  max_batch_size=50)
         | 'Publish Hashtags' >> WriteToPubSub(
             topic=output_topic, category=Category.DAILY_HASHTAGS))
        """
        -> Outputs the sum of processed events for a given fixed-time window.
        """
        (hashtags
         | 'Apply 5 Minutes' >> beam.WindowInto(
             beam.window.FixedWindows(size=5 * 60),
             trigger=trigger.Repeatedly(trigger.AfterCount(20)),
             accumulation_mode=trigger.AccumulationMode.DISCARDING)
         | 'CG+CC' >> beam.CombineGlobally(
             beam.combiners.CountCombineFn()).without_defaults()
         | 'Publish Events Sum' >> WriteToPubSub(
             topic=output_topic, category=Category.GLOBAL_EVENTS))
        """
        -> Outputs the top 5 trending hashtags within a given fixed-time window.
        """
        (hashtags
         | 'Apply %s Min FW' % '30' >> beam.WindowInto(
             beam.window.FixedWindows(size=SECONDS_IN_HALF_HOUR),
             trigger=trigger.Repeatedly(trigger.AfterCount(2)),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'Grouping Trends' >> PairWithOneCombine()
         | '%s Trending Hashtags' % TRENDING_HASHTAGS_LIMIT >>
         beam.CombineGlobally(
             TopDistinctFn(
                 n=TRENDING_HASHTAGS_LIMIT,
                 compare=lambda a, b: a[1] < b[1])).without_defaults()
         | 'Format Trending Hashtags' >> beam.ParDo(FormatHashtagsFn())
         | 'Publish Trending Hashtags' >> WriteToPubSub(
             topic=output_topic, category=Category.TRENDING_HASHTAGS))
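The 'Batch Hashtags' step in Example #21 uses beam.BatchElements, which buffers individual elements into lists bounded by the configured sizes so that the publish step does not fire once per element. A minimal, self-contained sketch with hypothetical batch sizes:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(range(12))
         | beam.BatchElements(min_batch_size=5, max_batch_size=5)
         | beam.Map(print))   # prints lists of (up to) five elements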
Example #22
def run():
    pipeline_options = PipelineOptions(
        ["--runner=DirectRunner", "--streaming"])
    p = beam.Pipeline(options=pipeline_options)

    # read
    topic_path = "projects/qwiklabs-gcp-34125c5e4e40e9e3/topics/pycon30-file"  # replace topic with yours
    lines = (p | 'read' >> beam.io.ReadFromPubSub(topic=topic_path,
                                                  with_attributes=True))

    # format message
    def format_message(message, timestamp=beam.DoFn.TimestampParam):
        message = json.loads(message.data)
        formatted_message = {
            'data': message.get('data'),
            'timestamp': float(message.get('event_time'))
        }
        return formatted_message

    formatted = lines | beam.Map(format_message)
    # windowed = formatted | beam.WindowInto(beam.window.FixedWindows(5))
    # windowed = formatted | beam.WindowInto(beam.window.SlidingWindows(60, 5))
    windowed = formatted | beam.WindowInto(
        beam.window.GlobalWindows(),
        trigger=trigger.Repeatedly(trigger.AfterCount(1)),
        accumulation_mode=trigger.AccumulationMode.ACCUMULATING)

    # split words
    def find_words(element):
        import re
        return re.findall(r'[A-Za-z\']+', element.get('data'))

    words = (windowed | 'split' >> (beam.FlatMap(find_words)))

    # count words
    def count_ones(word_ones):
        (word, ones) = word_ones
        return word, sum(ones)

    counts = (words
              | 'pair' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # aggr to list
    def aggr_to_list(values):
        try:
            if not values:
                return values
            elif isinstance(values, _ReiterableChain):
                return [x for x in values]
            elif len(values) == 1:
                return values[0]
            else:
                if isinstance(values[0], list):
                    return values[0] + [values[1]]
                else:
                    return [x for x in values]
        except Exception:
            print(values)
            pass

    aggred_list = counts | 'sort' >> beam.CombineGlobally(
        aggr_to_list).without_defaults()

    # out
    aggred_list | 'out' >> beam.Map(
        lambda x: logging.info(sorted(x, key=lambda x: x[1], reverse=True)))

    result = p.run()
    result.wait_until_finish()