Example #1
  def test_model_other_composite_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream()
          .advance_watermark_to(10)
          .add_elements(['a', 'a'])
          .add_elements(['a', 'b', 'b'])
          .advance_processing_time(60)
          .add_elements(['a'] * 100))
      pcollection = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_other_composite_triggers]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=Repeatedly(
                  AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_other_composite_triggers]
          | 'group' >> beam.GroupByKey()
          | 'count' >> beam.Map(
              lambda word_ones: (word_ones[0], sum(word_ones[1]))))
      assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 100)]))
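Outside the TestStream harness, the trigger marked between the snippet tags applies to any streaming PCollection the same way. A minimal sketch, assuming `events` is an existing PCollection (hypothetical name) and `import apache_beam as beam`:

from apache_beam.transforms.trigger import (
    AccumulationMode, AfterAny, AfterCount, AfterProcessingTime, Repeatedly)
from apache_beam.transforms.window import FixedWindows

# Repeatedly(...) re-arms after each firing; AfterAny(...) fires as soon as
# either 100 elements arrive or 60s of processing time pass in the window.
windowed = events | beam.WindowInto(
    FixedWindows(1 * 60),
    trigger=Repeatedly(AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
    accumulation_mode=AccumulationMode.DISCARDING)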
Example #2
 def test_fixed_after_first(self):
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterAny(AfterCount(2), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('ab')]},
         1,
         2)
     self.run_trigger_simple(
         FixedWindows(10),  # pyformat break
         AfterAny(AfterCount(5), AfterWatermark()),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (2, 'b'), (3, 'c')],
         {IntervalWindow(0, 10): [set('abc')]},
         1,
         2,
         late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
Example #3
 def test_trigger_encoding(self):
   for trigger_fn in (DefaultTrigger(),
                      AfterAll(AfterCount(1), AfterCount(10)),
                      AfterAny(AfterCount(10), AfterCount(100)),
                      AfterWatermark(early=AfterCount(1000)),
                      AfterWatermark(early=AfterCount(1000),
                                     late=AfterCount(1)),
                      Repeatedly(AfterCount(100)),
                      trigger.OrFinally(AfterCount(3), AfterCount(10))):
     context = pipeline_context.PipelineContext()
     self.assertEqual(
         trigger_fn,
         TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))
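For a single trigger instance, the round trip in the loop reduces to the following (same calls as in the test above):

from apache_beam.runners import pipeline_context
from apache_beam.transforms.trigger import AfterAny, AfterCount, TriggerFn

context = pipeline_context.PipelineContext()
trigger_fn = AfterAny(AfterCount(10), AfterCount(100))
proto = trigger_fn.to_runner_api(context)  # portable runner-API proto
assert TriggerFn.from_runner_api(proto, context) == trigger_fn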
Example #4
 def test_repeatedly_after_first(self):
   self.run_trigger_simple(
       FixedWindows(100),  # pyformat break
       Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
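        # Accumulating panes: the count fires at 3 and 6 elements, the
        # watermark closes the on-time pane at 'g', and each late element
        # re-fires the repeated trigger (matching the expected panes below).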
       AccumulationMode.ACCUMULATING,
       zip(range(7), 'abcdefg'),
       {IntervalWindow(0, 100): [
           set('abc'),
           set('abcdef'),
           set('abcdefg'),
           set('abcdefgx'),
           set('abcdefgxy'),
           set('abcdefgxyz')]},
       1,
       late_data=zip(range(3), 'xyz'))
Example #5
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = (
            pipeline
            | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(
                topic=pubsub_input_topic).with_output_types(bytes)
            | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message))

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
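                # Early panes fire once 5 elements or 10s of processing time
                # accumulate ahead of the watermark; late panes need BOTH 2
                # late elements AND 20s (AfterAll), within 60s of lateness.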
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        # No explicit run() needed: the Pipeline context manager runs the
        # pipeline and waits for it to finish when the with block exits.
Example #6
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info(
         'key: %s, value count: %s', kv[0], len(kv[1]))))

    result = p.run()
    result.wait_until_finish()
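The WithKeys/trigger/GroupByKey combination shown here is the manual way to batch by size or time; recent Beam SDKs also ship a higher-level transform for this. A hedged sketch, assuming `keyed` is a keyed PCollection (hypothetical name) and that the installed SDK version supports `max_buffering_duration_secs` on GroupIntoBatches:

from apache_beam.transforms.util import GroupIntoBatches

# Flushes a key's buffer at BATCH_SIZE elements or after BUFFERING_SECS
# of processing time, whichever comes first.
batches = keyed | GroupIntoBatches(
    BATCH_SIZE, max_buffering_duration_secs=BUFFERING_SECS)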
Example #7
        return [(Store_id, Store_location, Product_id, Product_category,
                 sold_unit, buy_rate, sell_price, profit, transaction_date)]


############# Create Pipeline ############
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    | 'Remove spaces in the data' >> beam.Map(lambda row: row.strip())
    | 'Split data' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate Profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make Key value' >> beam.Map(lambda row: (row[:-2], row[-1]))
    | 'Set Fixed Window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
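    # DISCARDING mode: each fired pane sums only the elements that arrived
    # since the previous firing within its 30-second window.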
    | 'Combine Result of 30 Sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    #|'Write to Text'>>beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    p.run().wait_until_finish()
Example #8
 def test_after_any_different_reasons(self):
     self._test(
         AfterAny(Repeatedly(AfterCount(2)), AfterProcessingTime()),
         0,
         DataLossReason.MAY_FINISH | DataLossReason.CONDITION_NOT_GUARANTEED)
Example #9
 def test_after_any_same_reason(self):
     self._test(AfterAny(AfterCount(1), AfterProcessingTime()), 0,
                DataLossReason.MAY_FINISH)
Example #10
 def test_after_any_some_unsafe(self):
     self._test(AfterAny(AfterCount(1), DefaultTrigger()), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #11
 def test_after_any_all_safe(self):
     self._test(AfterAny(Repeatedly(AfterCount(42)), DefaultTrigger()), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #12
 def test_after_any_one_may_finish(self):
     self._test(AfterAny(AfterCount(42), DefaultTrigger()), 0,
                DataLossReason.MAY_FINISH)
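The expected values in these `_test` assertions are combinable flags. A small sketch of composing and inspecting them, assuming DataLossReason is the flag enum exposed by apache_beam.transforms.trigger:

from apache_beam.transforms.trigger import DataLossReason

# Reasons combine with bitwise OR, as in test_after_any_different_reasons:
combined = DataLossReason.MAY_FINISH | DataLossReason.CONDITION_NOT_GUARANTEED
# ...and individual flags are checked with bitwise AND:
assert combined & DataLossReason.MAY_FINISH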