Example #1
  def test_model_other_composite_triggers(self):
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=pipeline_options) as p:
      test_stream = (
          TestStream().advance_watermark_to(10).add_elements(
              ['a', 'a']).add_elements(
                  ['a', 'b',
                   'b']).advance_processing_time(60).add_elements(['a'] * 100))
      pcollection = (
          p
          | test_stream
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1)))

      counts = (
          # [START model_other_composite_triggers]
          pcollection | WindowInto(
              FixedWindows(1 * 60),
              trigger=Repeatedly(
                  AfterAny(AfterCount(100), AfterProcessingTime(1 * 60))),
              accumulation_mode=AccumulationMode.DISCARDING)
          # [END model_other_composite_triggers]
          | 'group' >> beam.GroupByKey()
          | 'count' >>
          beam.Map(lambda word_ones: (word_ones[0], sum(word_ones[1]))))
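      # The 60-second processing-time advance fires the first pane (a: 3,
      # b: 2); the 100 'a' elements that follow satisfy AfterCount(100) and,
      # because panes are discarded after firing, yield a separate ('a', 100).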
      assert_that(counts, equal_to([('a', 3), ('b', 2), ('a', 100)]))
Example #2
    def test_fixed_after_count_accumulating(self):
        # yapf: disable
        test_stream = (
            TestStream()
              .advance_watermark_to(0)
              .add_elements([('k1', 1), ('k1', 1), ('k2', 1), ('k2', 1)])
              .add_elements([('k1', 1), ('k1', 1)])
              .advance_watermark_to(2)
              .add_elements([('k1', 2), ('k2', 2)])  # These values are discarded.
              .advance_watermark_to_infinity())
        # yapf: enable

        # Fixed two-second windows that fire repeatedly after every two
        # elements per key, accumulating the elements across firings.
        windowing = Windowing(FixedWindows(2),
                              triggerfn=Repeatedly(AfterCount(2)),
                              accumulation_mode=AccumulationMode.ACCUMULATING)

        with TestPipeline() as p:
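            # Drive the trigger machinery by hand: reify each element's
            # window, bundle the elements by key, and let the general trigger
            # manager decide when each pane of the two-second window fires.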
            result = (
                p
                | test_stream
                | WindowInto(windowing.windowfn)
                | ParDo(trigger_manager._ReifyWindows())
                | ParDo(trigger_manager._GroupBundlesByKey())
                | ParDo(trigger_manager.GeneralTriggerManagerDoFn(windowing))
                | Map(lambda elm: (elm[0], elm[1][0].windows[0],
                                   [v.value for v in elm[1]])))
            assert_that(
                result,
                equal_to([
                    ('k1', IntervalWindow(0, 2), [1, 1]),
                    ('k2', IntervalWindow(0, 2), [1, 1]),
                    ('k1', IntervalWindow(0, 2), [1, 1, 1, 1]),
                ]))
Example #3
 def test_sessions_repeatedly_after_count(self):
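     # Same elements and trigger in both runs: with ACCUMULATING the second
     # pane contains everything seen so far, while with DISCARDING it contains
     # only the two elements that arrived after the first firing.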
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         Repeatedly(AfterCount(2)),
         AccumulationMode.ACCUMULATING,
         [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
         {IntervalWindow(1, 25): [set('abc'), set('abcde')]},
         1,
         3)
     self.run_trigger_simple(
         Sessions(10),  # pyformat break
         Repeatedly(AfterCount(2)),
         AccumulationMode.DISCARDING,
         [(1, 'a'), (15, 'b'), (6, 'c'), (2, 'd'), (7, 'e')],
         {IntervalWindow(1, 25): [set('abc'), set('de')]},
         1,
         3)
Example #4
    def expand(self, pcoll):
        windowing_saved = pcoll.windowing
        if windowing_saved.is_default():
            # In this (common) case we can use a trivial trigger driver
            # and avoid the (expensive) window param.
            globally_windowed = window.GlobalWindows.windowed_value(None)
            MIN_TIMESTAMP = window.MIN_TIMESTAMP

            def reify_timestamps(element, timestamp=DoFn.TimestampParam):
                key, value = element
                if timestamp == MIN_TIMESTAMP:
                    timestamp = None
                return key, (value, timestamp)

            def restore_timestamps(element):
                key, values = element
                return [
                    globally_windowed.with_value((key, value)) if
                    timestamp is None else window.GlobalWindows.windowed_value(
                        (key, value), timestamp)
                    for (value, timestamp) in values
                ]

        else:

            def reify_timestamps(element,
                                 timestamp=DoFn.TimestampParam,
                                 window=DoFn.WindowParam):
                key, value = element
                # Transport the window as part of the value and restore it later.
                return key, windowed_value.WindowedValue(
                    value, timestamp, [window])

            def restore_timestamps(element):
                key, windowed_values = element
                return [
                    wv.with_value((key, wv.value)) for wv in windowed_values
                ]

        ungrouped = pcoll | Map(reify_timestamps).with_output_types(Any)

        # TODO(BEAM-8104) Using global window as one of the standard window.
        # This is to mitigate the Dataflow Java Runner Harness limitation to
        # accept only standard coders.
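        # Fire after every element and discard it once emitted, so the
        # GroupByKey below redistributes records immediately instead of
        # buffering them until the end of a window.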
        ungrouped._windowing = Windowing(
            window.GlobalWindows(),
            triggerfn=Repeatedly(AfterCount(1)),
            accumulation_mode=AccumulationMode.DISCARDING,
            timestamp_combiner=TimestampCombiner.OUTPUT_AT_EARLIEST)
        result = (ungrouped
                  | GroupByKey()
                  | FlatMap(restore_timestamps).with_output_types(Any))
        result._windowing = windowing_saved
        return result
Example #5
 def test_trigger_encoding(self):
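   # Each trigger must survive a round trip through its runner API proto
   # representation unchanged.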
   for trigger_fn in (DefaultTrigger(),
                      AfterAll(AfterCount(1), AfterCount(10)),
                      AfterAny(AfterCount(10), AfterCount(100)),
                      AfterWatermark(early=AfterCount(1000)),
                      AfterWatermark(early=AfterCount(1000),
                                     late=AfterCount(1)),
                      Repeatedly(AfterCount(100)),
                      trigger.OrFinally(AfterCount(3), AfterCount(10))):
     context = pipeline_context.PipelineContext()
     self.assertEqual(
         trigger_fn,
         TriggerFn.from_runner_api(trigger_fn.to_runner_api(context), context))
Example #6
 def test_repeatedly_after_first(self):
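   # Fires whenever three more elements arrive or the watermark passes the
   # end of the window; with ACCUMULATING mode every pane, including the
   # late-data panes, contains all elements seen so far.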
   self.run_trigger_simple(
       FixedWindows(100),  # pyformat break
       Repeatedly(AfterAny(AfterCount(3), AfterWatermark())),
       AccumulationMode.ACCUMULATING,
       zip(range(7), 'abcdefg'),
       {IntervalWindow(0, 100): [
           set('abc'),
           set('abcdef'),
           set('abcdefg'),
           set('abcdefgx'),
           set('abcdefgxy'),
           set('abcdefgxyz')]},
       1,
       late_data=zip(range(3), 'xyz'))
Example #7
  def test_sessions_after_each(self):
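    # AfterEach runs its sub-triggers in sequence: AfterCount(2) controls the
    # first pane, then AfterCount(3) controls the next. Wrapping the sequence
    # in Repeatedly restarts it, which is what allows the third pane below.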
    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        AfterEach(AfterCount(2), AfterCount(3)),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {IntervalWindow(0, 11): [set('ab')],
         IntervalWindow(0, 15): [set('abcdef')]},
        2)

    self.run_trigger_simple(
        Sessions(10),  # pyformat break
        Repeatedly(AfterEach(AfterCount(2), AfterCount(3))),
        AccumulationMode.ACCUMULATING,
        zip(range(10), 'abcdefghij'),
        {IntervalWindow(0, 11): [set('ab')],
         IntervalWindow(0, 15): [set('abcdef')],
         IntervalWindow(0, 17): [set('abcdefgh')]},
        2)
Example #8
def run(argv=None):
  """Build and run the pipeline."""
  args = ["--runner=PortableRunner",
          "--job_endpoint=localhost:8099",
          "--streaming"]
  if argv:
    args.extend(argv)

  parser = argparse.ArgumentParser()
  parser.add_argument('--count',
                      dest='count',
                      type=int,
                      default=0,
                      help='Number of triggers to generate '
                           '(0 means emit forever).')
  parser.add_argument('--interval_ms',
                      dest='interval_ms',
                      type=int,
                      default=500,
                      help='Interval between records per parallel '
                           'Flink subtask.')

  known_args, pipeline_args = parser.parse_known_args(args)

  pipeline_options = PipelineOptions(pipeline_args)

  p = beam.Pipeline(options=pipeline_options)

  messages = (p | FlinkStreamingImpulseSource()
              .set_message_count(known_args.count)
              .set_interval_ms(known_args.interval_ms))

  _ = (messages | 'decode' >> beam.Map(lambda x: ('', 1))
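       # Re-fire the single global window periodically based on processing
       # time; DISCARDING means each firing counts only the records that
       # arrived since the previous one.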
       | 'window' >> beam.WindowInto(window.GlobalWindows(),
                                     trigger=Repeatedly(
                                         AfterProcessingTime(5 * 1000)),
                                     accumulation_mode=
                                     AccumulationMode.DISCARDING)
       | 'group' >> beam.GroupByKey()
       | 'count' >> beam.Map(count)
       | 'log' >> beam.Map(lambda x: logging.info("%d" % x[1])))

  result = p.run()
  result.wait_until_finish()
Example #9
  def test_buffering_timer_in_global_window_streaming(self):
    max_buffering_duration_secs = 42

    start_time = timestamp.Timestamp(0)
    test_stream = TestStream().advance_watermark_to(start_time)
    for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
      test_stream.add_elements(
          [TimestampedValue(value, start_time + i)]) \
        .advance_processing_time(5)
    test_stream.advance_watermark_to(
        start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
      .advance_watermark_to_infinity()

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements.
      # Since we're in a global window, we would have been waiting
      # for all the elements to arrive without the buffering time limit.
      batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

      # To trigger the processing time timer, use a fake clock with start time
      # being Timestamp(0). Since the fake clock never really advances during
      # the pipeline execution, meaning that the timer is always set to the same
      # value, the timer will be fired on every element after the first firing.
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | WindowInto(
              GlobalWindows(),
              trigger=Repeatedly(AfterCount(1)),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | util.GroupIntoBatches(
              batch_size, max_buffering_duration_secs, fake_clock)
          | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # We will flush twice when the max buffering duration is reached and when
      # the global window ends.
      assert_that(num_elements_per_batch, equal_to([9, 1]))
Example #10
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
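     # Emit a batch whenever BATCH_SIZE elements have accumulated or
     # BUFFERING_SECS of processing time have passed since the first element
     # of the pane, whichever comes first; fired elements are discarded.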
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info(
         'key: %s, value count: %s', kv[0], len(kv[1]))))

    run = p.run()
    run.wait_until_finish()
Example #11
 def test_repeatedly_unsafe_underlying(self):
     self._test(Repeatedly(AfterCount(42)), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #12
 def test_repeatedly_condition_underlying(self):
     self._test(Repeatedly(AfterCount(2)), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #13
        return [(Store_id, Store_location, Product_id, Product_category,
                 sold_unit, buy_rate, sell_price, profit, transaction_date)]


#############Create Pipeline ###########
stream_data = (
    p
    | 'Read from PubSub' >> beam.io.ReadFromPubSub(subscription=inputs_pattern)
    | 'Remove space in the Data' >> beam.Map(lambda row: row.strip())
    | 'Split Data ' >> beam.Map(lambda row: row.decode().split(','))
    | 'Calculate Profit' >> beam.Map(calculateProfit)
    | 'Apply custom timestamp' >> beam.Map(custom_timestamp)
    | 'Make Key value' >> beam.Map(lambda row: (row[:-2], row[-1]))
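    # Within each 30-second window, fire whenever five elements have arrived
    # or ten seconds of processing time have elapsed since the first element,
    # discarding each pane once it has been emitted.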
    | 'Set Fixed Window of 30 sec' >> beam.WindowInto(
        window.FixedWindows(30),
        trigger=Repeatedly(AfterAny(AfterCount(5), AfterProcessingTime(10))),
        accumulation_mode=AccumulationMode.DISCARDING)
    | 'Combine Result of 30 Sec' >> beam.CombinePerKey(sum)
    | 'Format result and append time' >> beam.ParDo(BuildRecordFn())
    | 'Prepare data for BigQuery' >> beam.Map(covert_to_dict)
    #|'Write to Text'>>beam.io.WriteToText(outputs_prefix)
    | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        table='sales', dataset='beam', project='beam-290211'))

if __name__ == '__main__':
    logging.getLogger().setLevel(logging.INFO)
    p.run().wait_until_finish()
Example #14
 def test_after_any_all_safe(self):
     self._test(AfterAny(Repeatedly(AfterCount(42)), DefaultTrigger()), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #15
 def test_after_each_all_safe(self):
     self._test(AfterEach(Repeatedly(AfterCount(1)), DefaultTrigger()), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #16
 def test_after_any_different_reasons(self):
     self._test(
         AfterAny(Repeatedly(AfterCount(2)),
                  AfterProcessingTime()), 0, DataLossReason.MAY_FINISH
         | DataLossReason.CONDITION_NOT_GUARANTEED)
Example #17
 def test_repeatedly_condition_underlying(self):
     self._test(Repeatedly(AfterCount(2)), 0,
                DataLossReason.CONDITION_NOT_GUARANTEED)
Example #18
 def test_repeatedly_may_finish_underlying(self):
     self._test(Repeatedly(AfterCount(1)), 0,
                DataLossReason.NO_POTENTIAL_LOSS)
Example #19
 def test_repeatedly_safe_underlying(self):
     self._test(Repeatedly(DefaultTrigger()), 0,
                DataLossReason.NO_POTENTIAL_LOSS)