Code example #1
    def test_multiple_accumulating_firings(self):
        # PCollection will contain elements from 1 to 10.
        elements = [i for i in range(1, 11)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([('key', str(i))])
            if i % 5 == 0:
                ts.advance_watermark_to(i)
                ts.advance_processing_time(5)

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            _ = (
                p
                | ts
                | beam.WindowInto(
                    FixedWindows(10),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(
                        early=AfterAll(AfterCount(1), AfterProcessingTime(5))))
                | beam.GroupByKey()
                | beam.FlatMap(lambda x: x[1])
                | beam.ParDo(self.record_dofn()))

        # The trigger should fire twice: once after 5 seconds and once after 10.
        # The firings should accumulate the output.
        first_firing = [str(i) for i in elements if i <= 5]
        second_firing = [str(i) for i in elements]
        self.assertListEqual(first_firing + second_firing,
                             TriggerPipelineTest.all_records)
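
These snippets are test methods lifted out of their modules; a plausible import block for running them is sketched below. Module paths follow recent apache_beam releases (DataLossReason in particular is a newer addition), so verify them against your installed version.

# Imports the examples in this section appear to rely on. Paths follow
# recent apache_beam releases; treat exact locations as assumptions.
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.runners import pipeline_context
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.test_stream import TestStream
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms import trigger
from apache_beam.transforms.trigger import (
    AccumulationMode, AfterAll, AfterAny, AfterCount, AfterProcessingTime,
    AfterWatermark, DataLossReason, DefaultTrigger, Repeatedly, TriggerFn)
from apache_beam.transforms.window import (
    FixedWindows, GlobalWindows, IntervalWindow, Sessions)
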
Code example #2
  def test_combining_with_accumulation_mode_and_fanout(self):
    # PCollection will contain elements from 1 to 5.
    elements = [i for i in range(1, 6)]

    ts = TestStream().advance_watermark_to(0)
    for i in elements:
      ts.add_elements([i])
    ts.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    with TestPipeline(options=options) as p:
      result = (
          p
          | ts
          | beam.WindowInto(
              GlobalWindows(),
              accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
              trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
          | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

      def has_expected_values(actual):
        from hamcrest.core import assert_that as hamcrest_assert
        from hamcrest.library.collection import contains
        from hamcrest.library.collection import only_contains
        ordered = sorted(actual)
        # Early firings.
        hamcrest_assert(ordered[:4], contains(1, 3, 6, 10))
        # Different runners produce different numbers of 15s, but there
        # should be at least one 15.
        hamcrest_assert(ordered[4:], only_contains(15))

      assert_that(result, has_expected_values)
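
Note that with_fanout is not specific to streaming; a minimal batch sketch (assuming nothing beyond the apache_beam package) yields the same total:

# Minimal batch counterpart: the fanout splits the combine into a two-level
# tree of partial sums without changing the result.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create(range(1, 6))
        | beam.CombineGlobally(sum).with_fanout(2)
        | beam.Map(print))  # prints 15
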
Code example #3
def test_sessions_after_all(self):
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterAll(AfterCount(2), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(1, 13): [set('abc')]},
        1,
        2)
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterAll(AfterCount(5), AfterWatermark()),
        AccumulationMode.ACCUMULATING,
        [(1, 'a'), (2, 'b'), (3, 'c')],
        {IntervalWindow(1, 13): [set('abcxy')]},
        1,
        2,
        late_data=[(1, 'x'), (2, 'y'), (3, 'z')])
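
Why do timestamps 1, 2, and 3 with a 10-second gap land in IntervalWindow(1, 13)? Plain-Python arithmetic (not the Beam WindowFn API) makes the session merge visible:

# Session merging by hand: each element opens a window [t, t + gap), and
# overlapping windows merge. Arithmetic only, not Beam's API.
gap = 10
timestamps = [1, 2, 3]
start, end = timestamps[0], timestamps[0] + gap
for t in timestamps[1:]:
    if t < end:               # overlaps the open session
        end = max(end, t + gap)
print(start, end)  # -> 1 13, i.e. IntervalWindow(1, 13)
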
Code example #4
File: trigger_test.py Project: twang126/beam-1
def test_trigger_encoding(self):
  for trigger_fn in (DefaultTrigger(),
                     AfterAll(AfterCount(1), AfterCount(10)),
                     AfterAny(AfterCount(10), AfterCount(100)),
                     AfterWatermark(early=AfterCount(1000)),
                     AfterWatermark(early=AfterCount(1000),
                                    late=AfterCount(1)),
                     Repeatedly(AfterCount(100)),
                     trigger.OrFinally(AfterCount(3), AfterCount(10))):
    context = pipeline_context.PipelineContext()
    self.assertEqual(
        trigger_fn,
        TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))
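
A single round trip in isolation looks like this, assuming pipeline_context keeps its home in apache_beam.runners:

# Encode one composite trigger to its runner-API proto and decode it back.
from apache_beam.runners import pipeline_context
from apache_beam.transforms.trigger import AfterAll, AfterCount, TriggerFn

context = pipeline_context.PipelineContext()
trigger_fn = AfterAll(AfterCount(1), AfterCount(10))
proto = trigger_fn.to_runner_api(context)
assert TriggerFn.from_runner_api(proto, context) == trigger_fn
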
Code example #5
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        # The 'with' block runs the pipeline and waits for it to finish on
        # exit, so no explicit run()/wait_until_finish() call is needed here.
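
The composite trigger in 'WindowFaceInfo' reads cleanly on its own: early panes fire when either 5 elements arrive or 10 seconds of processing time elapse, while late panes require both 2 late elements and 20 seconds. As a standalone sketch:

# The 'WindowFaceInfo' trigger in isolation: AfterAny fires on any
# condition, AfterAll on every condition, around the watermark trigger.
from apache_beam.transforms.trigger import (
    AfterAll, AfterAny, AfterCount, AfterProcessingTime, AfterWatermark)

composite = AfterWatermark(
    early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
    late=AfterAll(AfterCount(2), AfterProcessingTime(20)))
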
Code example #6
File: combiners_test.py Project: ostrokach/beam
    def test_combining_with_accumulation_mode_and_fanout(self):
        # PCollection will contain elements from 1 to 5.
        elements = [i for i in range(1, 6)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([i])
        ts.advance_watermark_to_infinity()

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            result = (
                p
                | ts
                | beam.WindowInto(
                    GlobalWindows(),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
                | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

            # The firings for DISCARDING mode would be [1, 2, 3, 4, 5, 0, 0].
            firings = [1, 3, 6, 10, 15, 15, 15]
            assert_that(result, equal_to(firings))
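
The comment above refers to the DISCARDING counterpart; a hedged variant (not from the source project, reusing ts and options from the test above, with expected panes taken from that comment) would swap the accumulation mode:

# DISCARDING variant: each pane holds only the elements since the previous
# firing, so per the original comment the firings are [1, 2, 3, 4, 5, 0, 0].
with TestPipeline(options=options) as p:
    result = (
        p
        | ts
        | beam.WindowInto(
            GlobalWindows(),
            accumulation_mode=trigger.AccumulationMode.DISCARDING,
            trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
        | beam.CombineGlobally(sum).without_defaults().with_fanout(2))
    assert_that(result, equal_to([1, 2, 3, 4, 5, 0, 0]))
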
Code example #7
def test_after_all_safe(self):
    self._test(AfterAll(Repeatedly(AfterCount(1)), DefaultTrigger()), 0,
               DataLossReason.NO_POTENTIAL_LOSS)
Code example #8
def test_after_all_some_unsafe(self):
    self._test(AfterAll(AfterCount(1), DefaultTrigger()), 0,
               DataLossReason.MAY_FINISH)
Code example #9
def test_after_all_all_may_finish(self):
    self._test(AfterAll(AfterCount(42), AfterProcessingTime(42)), 0,
               DataLossReason.MAY_FINISH)
Code example #10
def test_after_all_some_may_finish(self):
    self._test(AfterAll(AfterCount(1), DefaultTrigger()), 0,
               DataLossReason.NO_POTENTIAL_LOSS)
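
The thread through examples #7-#10: a trigger that can finish may drop data arriving afterwards, AfterAll finishes only when all of its subtriggers finish, and Repeatedly(...) never finishes. Examples #8 and #10 assert different results for the same trigger because the data-loss analysis changed between Beam versions. A sketch of the two compositions (assuming the _test helper wraps a may_lose_data-style check):

# Safe vs. possibly-finishing AfterAll compositions (illustration only).
from apache_beam.transforms.trigger import (
    AfterAll, AfterCount, DefaultTrigger, Repeatedly)

# Repeatedly never finishes, so this AfterAll can never finish: safe.
safe = AfterAll(Repeatedly(AfterCount(1)), DefaultTrigger())

# AfterCount(1) finishes after one element. Older analyses flagged this
# composition as MAY_FINISH (example #8); newer ones report
# NO_POTENTIAL_LOSS because DefaultTrigger keeps it alive (example #10).
maybe_lossy = AfterAll(AfterCount(1), DefaultTrigger())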