Example #1
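  # AfterWatermark trigger with an early firing one minute (processing time)
  # after the first element and a late firing for each late element, over
  # 15-second fixed windows with 20s allowed lateness and discarding panes.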
  def test_model_early_late_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream().advance_watermark_to(10).add_elements([
              'a', 'a', 'a', 'b', 'b'
          ]).add_elements([
              TimestampedValue('a', 10)
          ]).advance_watermark_to(20).advance_processing_time(60).add_elements(
              [TimestampedValue('a', 10)]))
      trigger = (
          # [START model_early_late_triggers]
          AfterWatermark(
              early=AfterProcessingTime(delay=1 * 60), late=AfterCount(1))
          # [END model_early_late_triggers]
      )
      counts = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
          | WindowInto(
              FixedWindows(15),
              trigger=trigger,
              allowed_lateness=20,
              accumulation_mode=AccumulationMode.DISCARDING)
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 4), ('b', 2), ('a', 1)]))
Example #2
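  # A plain AfterProcessingTime trigger: each 1-minute fixed window fires once,
  # 10 minutes of processing time after its first element, discarding the pane.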
  def test_model_setting_trigger(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream().advance_watermark_to(10).add_elements(
              ['a', 'a', 'a', 'b',
               'b']).advance_watermark_to(70).advance_processing_time(600))
      pcollection = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_setting_trigger]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=AfterProcessingTime(10 * 60),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_setting_trigger]
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2)]))
Example #3
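    # ACCUMULATING mode: the second early firing replays the first five
    # elements together with the new ones, as asserted at the end of the test.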
    def test_multiple_accumulating_firings(self):
        # PCollection will contain elements from 1 to 10.
        elements = [i for i in range(1, 11)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([('key', str(i))])
            if i % 5 == 0:
                ts.advance_watermark_to(i)
                ts.advance_processing_time(5)

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            _ = (
                p
                | ts
                | beam.WindowInto(
                    FixedWindows(10),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(
                        early=AfterAll(AfterCount(1), AfterProcessingTime(5))))
                | beam.GroupByKey()
                | beam.FlatMap(lambda x: x[1])
                | beam.ParDo(self.record_dofn()))

        # The trigger should fire twice. Once after 5 seconds, and once after 10.
        # The firings should accumulate the output.
        first_firing = [str(i) for i in elements if i <= 5]
        second_firing = [str(i) for i in elements]
        self.assertListEqual(first_firing + second_firing,
                             TriggerPipelineTest.all_records)
Example #4
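# Streams face-detection results from Pub/Sub, stamps them with event-time
# timestamps, and windows them with a composite trigger: early firings after
# 5 elements or 10s of processing time, late firings once both 2 late elements
# and 20s have passed; per-emotion groups are then written to BigQuery.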
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        # The `with` block runs the pipeline and waits for completion on exit,
        # so no explicit pipeline.run() is needed here.
Example #5
 def expand(self, pcoll):
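     # One-minute fixed windows that fire early 20 seconds (processing time)
     # after the first element and again at the watermark; ACCUMULATING panes,
     # so later firings include all accounts seen so far per offer_id.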
     return (pcoll
             | "Parse message" >> beam.ParDo(PubsubMessageParser())
             | "Windowing" >> beam.WindowInto(FixedWindows(60),
                                              trigger=AfterWatermark(
                                                  early=AfterProcessingTime(delay=20)),
                                              accumulation_mode=AccumulationMode.ACCUMULATING)
             | "WithKeys" >> beam.Map(lambda account_offer: ((account_offer['offer_id']), account_offer))
             | beam.GroupByKey()
             | 'Count distinct accounts' >> beam.ParDo(DistinctAccountCount())
             | 'Map to BQ row' >> beam.ParDo(ConvertStatToBQRow()))
Example #6
def run(argv=None):
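    # Streaming pipeline over Pub/Sub frames: 10-second fixed windows fire 5s
    # of processing time after the first element and discard their panes; a
    # per-window frame count is passed as a singleton side input to ComputeMean.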
    class TemplateOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input', default='frames.log')
            parser.add_argument('--output', default='output.txt')

    options = PipelineOptions(flags=argv)

    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    # Uncomment this to run the pipeline on the Cloud (Dataflow)
    # options.view_as(StandardOptions).runner = 'DataflowRunner'

    with beam.Pipeline(options=options) as p:
        windowed = \
            (p
             | 'Read From Pub/Sub' >> beam.io.ReadFromPubSub(
                            topic='projects/alert-shape-256811/topics/ml-flow',
                            timestamp_attribute='timestamp').with_output_types(six.binary_type)
             | 'Transform To Numpy Array' >> beam.ParDo(TransformToNumpyArrayFn())
             | beam.WindowInto(
                            window.FixedWindows(10),
                            trigger=AfterProcessingTime(5),
                            accumulation_mode=AccumulationMode.DISCARDING))

        counted = \
            (windowed
             | 'Add Default Key' >> beam.Map(lambda x: (0, 1))
             | 'Count Num Frames' >> beam.CombinePerKey(sum)
             | 'Drop Default Key' >> beam.ParDo(DropKey()))

        # a = \
        #     (windowed_frames
        #      | 'Add Window As Key' >> beam.ParDo(KeyIntoWindow()))

        # b = (windowed_frames
        #      | 'Group By Key' >> beam.GroupByKey()
        #      | 'Drop Key' >> beam.ParDo(DropKey()))

        (windowed | 'Detect Labels' >> beam.ParDo(DetectLabelsFn())
         | 'Flatten' >> beam.FlatMap(lambda x: x)
         | 'Pair With One' >> beam.Map(lambda x: (x, 1))
         | 'Group For Mean' >> beam.GroupByKey()
         | 'Compute Mean' >> beam.ParDo(ComputeMean(),
                                        beam.pvalue.AsSingleton(counted))
         # | 'Sum Label Occurrences' >> beam.CombineValues(MeanCombineFn())
         # | 'Format with Window and Timestamp' >> beam.ParDo(WindowFormatterFn())
         # | 'Publish Frames' >> beam.io.WriteToPubSub(
         #                topic='projects/alert-shape-256811/topics/ml-flow-out'))
         | 'Just Print' >> beam.Map(lambda x: logging.info(x)))
Example #7
def run(argv=None):
  """Build and run the pipeline."""
  args = ["--runner=PortableRunner",
          "--job_endpoint=localhost:8099",
          "--streaming"]
  if argv:
    args.extend(argv)

  parser = argparse.ArgumentParser()
  parser.add_argument('--count',
                      dest='count',
                      default=0,
                      help='Number of triggers to generate '
                           '(0 means emit forever).')
  parser.add_argument('--interval_ms',
                      dest='interval_ms',
                      default=500,
                      help='Interval between records per parallel '
                           'Flink subtask.')

  known_args, pipeline_args = parser.parse_known_args(args)

  pipeline_options = PipelineOptions(pipeline_args)

  p = beam.Pipeline(options=pipeline_options)

  messages = (p | FlinkStreamingImpulseSource()
              .set_message_count(known_args.count)
              .set_interval_ms(known_args.interval_ms))

  _ = (messages | 'decode' >> beam.Map(lambda x: ('', 1))
       | 'window' >> beam.WindowInto(window.GlobalWindows(),
                                     trigger=Repeatedly(
                                         AfterProcessingTime(5 * 1000)),
                                     accumulation_mode=
                                     AccumulationMode.DISCARDING)
       | 'group' >> beam.GroupByKey()
       | 'count' >> beam.Map(count)
       | 'log' >> beam.Map(lambda x: logging.info("%d" % x[1])))

  result = p.run()
  result.wait_until_finish()
Example #8
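# Reads Pub/Sub messages, logs them, and windows them into fixed windows that
# fire 10s of processing time after the first element, accumulating panes with
# 30 minutes of allowed lateness, before appending rows to BigQuery.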
def run(inputTopic, outputTable, windowSize, pipelineArgs):
    """Build and run the pipeline."""
    pipeline_options = PipelineOptions(pipelineArgs,
                                       streaming=True,
                                       save_main_session=True)
    sideProperties = {'topic': inputTopic}

    with beam.Pipeline(options=pipeline_options) as p:
        # Read from PubSub into a PCollection.
        (p
         | "Read PubSub Messages" >> beam.io.ReadFromPubSub(topic=inputTopic)
         | 'ProcessMessages' >> beam.ParDo(ProcessPubSubMessage(),
                                           sideProperties)
         | 'PrintToLog' >> beam.ParDo(PrettyPrintMessage())
         | "chunk messages" >> beam.WindowInto(
             window.FixedWindows(windowSize, 0),
             trigger=AfterProcessingTime(10),
             accumulation_mode=AccumulationMode.ACCUMULATING,
             allowed_lateness=1800  # 30 minutes
         )
         | 'WriteToBigQuery' >> WriteToBigQuery(
             outputTable,
             schema={
                 'fields': [{
                     "name": "created_at",
                     "mode": "NULLABLE",
                     "type": "TIMESTAMP"
                 }, {
                     "name": "topic",
                     "type": "STRING",
                     "mode": "REQUIRED"
                 }, {
                     "name": "data",
                     "type": "STRING",
                     "mode": "NULLABLE",
                 }]
             },
             write_disposition=BigQueryDisposition.WRITE_APPEND))
Example #9
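# Batches elements under the global window: Repeatedly(AfterAny(...)) fires
# whenever BATCH_SIZE elements arrive or BUFFERING_SECS of processing time
# elapse, and GroupByKey emits each discarded batch for logging.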
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info(
         'key: %s, value count: %s', kv[0], len(kv[1]))))

    result = p.run()
    result.wait_until_finish()
Example #10
def run():
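    # Dataflow job comparing accumulation modes: taxi ride events are windowed
    # per minute with early processing-time firings, counted per window with
    # CombineGlobally, and appended to a BigQuery table named after the mode.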
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from PubSub into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--accum_mode',
                        required=True,
                        help='Accumulation mode for pipeline')

    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)

    options.view_as(
        GoogleCloudOptions).job_name = f"{opts.accum_mode}-{time.time_ns()}"
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {
                "name": "taxi_events",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
        ]
    }

    input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
    output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

    if opts.accum_mode == 'accumulating':
        accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
    elif opts.accum_mode == 'discarding':
        accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING
    else:
        raise ValueError(
            'Invalid accumulation mode value. Use \'accumulating\' or \'discarding\' '
        )

    p = beam.Pipeline(options=options)

    (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
     | 'WindowByMinute' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=AfterWatermark(early=AfterProcessingTime(10)),
         accumulation_mode=accum_mode)
     | "CountPerMinute" >> beam.CombineGlobally(
         CountCombineFn()).without_defaults()
     | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         output_table,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
Example #11
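# Streaming Pub/Sub-to-BigQuery pipeline: decoded JSON events fall into
# 10-second fixed windows that fire 10s of processing time after the first
# element, accumulate panes, and allow 30 minutes of lateness; per-user counts
# are also aggregated per window batch for logging.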
def run(argv=None, save_main_session=True):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_table',
        required=True,
        help=(
            'Output BigQuery table for results specified as: '
            '<project-id>:<dataset-id>.<table-id> or <dataset-id>.<table-id>'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_topic',
        help=
        'Input PubSub topic of the form projects/<project-id>/topics/<topic-id>'
    )
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<project-id>/subscriptions/<subscription-id>"'))
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read from PubSub into a PCollection.
        if known_args.input_subscription:
            messages = (p
                        | beam.io.ReadFromPubSub(
                            subscription=known_args.input_subscription).
                        with_output_types(bytes))
        else:
            messages = (
                p
                | beam.io.ReadFromPubSub(
                    topic=known_args.input_topic).with_output_types(bytes))
        output = (
            messages
            | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
            | beam.WindowInto(
                window.FixedWindows(10, 0),
                trigger=AfterProcessingTime(10),
                accumulation_mode=AccumulationMode.ACCUMULATING,
                allowed_lateness=1800  # 30 minutes
            )
            | 'Parse Json' >> beam.Map(json.loads)
            | 'process' >> beam.ParDo(ProcessPubSubMessage()))

        # Streaming analytic here. Show this second
        aggregateRes = (
            output
            | 'PairWIthOne' >> beam.Map(lambda row: (
                row.user,
                1,
            ))
            | beam.CombinePerKey(sum)
            | 'Aggregate Per Window' >> beam.Map(
                lambda row, window=beam.DoFn.WindowParam: {
                    'event': row[0],
                    'event_count': row[1],
                    'window_batch':
                        window.start.to_utc_datetime().strftime("%H:%M:%S") +
                        '-' +
                        window.end.to_utc_datetime().strftime("%H:%M:%S")
                })
            | 'Convert To Pair KeyValue' >>
            beam.Map(lambda x: (x['window_batch'],
                                beam.Row(event=str(x['event']),
                                         event_count=int(x['event_count']))))
            | 'Aggregate per Batch' >> beam.GroupByKey())
        aggregateRes | 'Print' >> beam.ParDo(PrettyPrintMessage())

        # Write to BigQuery.
        bqOutput = output | 'ConvertToDictionary' >> beam.Map(
            lambda x: {
                'user': x.user,
                'action': x.action,
                'created_at': x.created_at
            })
        bqOutput | WriteToBigQuery(
            known_args.output_table,
            schema={
                'fields': [{
                    "mode": "NULLABLE",
                    "name": "created_at",
                    "type": "TIMESTAMP"
                }, {
                    "mode": "NULLABLE",
                    "name": "user",
                    "type": "STRING"
                }, {
                    "mode": "NULLABLE",
                    "name": "action",
                    "type": "STRING"
                }]
            },
            write_disposition=BigQueryDisposition.WRITE_APPEND)
Example #12
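 # Data-loss analysis: a standalone AfterProcessingTime trigger fires once and
 # then finishes, so it is expected to be flagged MAY_FINISH.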
 def test_after_processing_time(self):
     self._test(AfterProcessingTime(), 0, DataLossReason.MAY_FINISH)
Example #13
        return [(Store_id, Store_location, Product_id, Product_category,
                 sold_unit, buy_rate, sell_price, profit, transaction_date)]


#############Create Pipeline ###########
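# Sales records from Pub/Sub are cleaned, keyed, and windowed into 30-second
# fixed windows; Repeatedly(AfterAny(...)) fires after every 5 elements or 10s
# of processing time, discarding between firings, before per-key sums are
# formatted and appended to BigQuery.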
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    | 'Remove space in the Data' >> beam.Map(lambda row: row.lstrip().rstrip())
    | 'Split Data ' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate Profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make Key value' >> beam.Map(lambda row: (row[:-2], row[-1]))
    | 'Set Fixed Window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
    | 'Combine Result of 30 Sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    #|'Write to Text'>>beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

p.run().wait_until_finish()

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    run()
Example #14
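# Streaming word count over Pub/Sub input: words are paired with 1, windowed
# into 30-second fixed windows with an AfterProcessingTime(20) trigger and
# discarding accumulation, then grouped, counted, and written out as text.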
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output', required=True, help='Output file path, e.g. op.csv.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        messages = (
            p
            | beam.io.ReadFromPubSub(subscription=known_args.input_subscription
                                     ).with_output_types(bytes))
    else:
        messages = (p
                    | beam.io.ReadFromPubSub(
                        topic=known_args.input_topic).with_output_types(bytes))

    lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    counts = (
        lines
        | 'split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))

        #DISCARDING
        #| beam.WindowInto(window.SlidingWindows(30, 1))
        | 'window' >> beam.WindowInto(
            window.FixedWindows(30),
            trigger=AfterProcessingTime(20),
            accumulation_mode=AccumulationMode.DISCARDING)
        #| 'window' >> beam.WindowInto(window.GlobalWindows(),trigger=Repeatedly(AfterCount(3)),accumulation_mode=AccumulationMode.ACCUMULATING)
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones)
        | 'format' >> beam.Map(format_result))

    counts | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()
Example #15
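 # Both AfterAll subtriggers may finish, so the composite trigger is expected
 # to be flagged MAY_FINISH.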
 def test_after_all_all_may_finish(self):
     self._test(AfterAll(AfterCount(42), AfterProcessingTime(42)), 0,
                DataLossReason.MAY_FINISH)
Example #16
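 # An AfterWatermark whose late firing may finish is still expected to report
 # NO_POTENTIAL_LOSS here.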
 def test_after_watermark_may_finish_late(self):
     self._test(AfterWatermark(late=AfterProcessingTime()), 60,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #17
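 # The two AfterAny subtriggers carry different data-loss reasons, so the
 # expected result is the union of MAY_FINISH and CONDITION_NOT_GUARANTEED.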
 def test_after_any_different_reasons(self):
     self._test(
         AfterAny(Repeatedly(AfterCount(2)),
                  AfterProcessingTime()), 0, DataLossReason.MAY_FINISH
         | DataLossReason.CONDITION_NOT_GUARANTEED)
Example #18
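 # Both AfterAny subtriggers reduce to the same reason, so only MAY_FINISH is
 # expected.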
 def test_after_any_same_reason(self):
     self._test(AfterAny(AfterCount(1), AfterProcessingTime()), 0,
                DataLossReason.MAY_FINISH)