Example #1
 def test_reshuffle_streaming_global_window(self):
     options = PipelineOptions()
     options.view_as(StandardOptions).streaming = True
     with TestPipeline(options=options) as pipeline:
         data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
         expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
         before_reshuffle = (pipeline
                             | beam.Create(data)
                             | beam.WindowInto(GlobalWindows())
                             | beam.GroupByKey()
                             | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
         assert_that(before_reshuffle,
                     equal_to(expected_data),
                     label='before_reshuffle')
         after_reshuffle = before_reshuffle | beam.Reshuffle()
         assert_that(after_reshuffle,
                     equal_to(expected_data),
                     label='after reshuffle')
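
This snippet (like several of the later ones) appears to be a method lifted from Apache Beam's own test suite, so its imports are not shown. A minimal sketch of the imports the test above relies on, all standard Beam Python SDK modules:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to
from apache_beam.transforms.window import GlobalWindows
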
Example #2
  def test_buffering_timer_in_fixed_window_streaming(self):
    window_duration = 6
    max_buffering_duration_secs = 100

    start_time = timestamp.Timestamp(0)
    test_stream = (
        TestStream().add_elements([
            TimestampedValue(value, start_time + i)
            for i, value in enumerate(GroupIntoBatchesTest._create_test_data())
        ]).advance_processing_time(150)
        .advance_watermark_to(start_time + window_duration)
        .advance_watermark_to(start_time + window_duration + 1)
        .advance_watermark_to_infinity())

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # To trigger the processing time timer, use a fake clock whose start time
      # is Timestamp(0).
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | "fixed window" >> WindowInto(FixedWindows(window_duration))
          | util.GroupIntoBatches(
              GroupIntoBatchesTest.BATCH_SIZE,
              max_buffering_duration_secs,
              fake_clock)
          | "count elements in batch" >> Map(lambda x: (None, len(x[1])))
          | "global window" >> WindowInto(GlobalWindows())
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # Window duration is 6 and batch size is 5, so output batch size
      # should be 5 (flush because of batch size reached).
      expected_0 = 5
      # There is only one element left in the window so batch size
      # should be 1 (flush because of max buffering duration reached).
      expected_1 = 1
      # Collection has 10 elements, there are only 4 left, so batch size should
      # be 4 (flush because of end of window reached).
      expected_2 = 4
      assert_that(
          num_elements_per_batch,
          equal_to([expected_0, expected_1, expected_2]),
          "assert2")
Example #3
def create_trigger_driver(
    windowing, is_batch=False, phased_combine_fn=None, clock=None):
  """Create the TriggerDriver for the given windowing and options."""

  # TODO(robertwb): We can do more if we know elements are in timestamp
  # sorted order.
  if windowing.is_default() and is_batch:
    driver = BatchGlobalTriggerDriver()
  elif (windowing.windowfn == GlobalWindows() and
        (windowing.triggerfn in [AfterCount(1), Always()]) and is_batch):
    # Here we also just pass through all the values exactly once.
    driver = BatchGlobalTriggerDriver()
  else:
    driver = GeneralTriggerDriver(windowing, clock)

  if phased_combine_fn:
    # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
    # the known phased_combine_fn here.
    driver = CombiningTriggerDriver(phased_combine_fn, driver)
  return driver
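
This function and the variant shown later in Example #5 are the same helper from two different SDK versions (the batch pass-through driver has been renamed over time). A minimal call sketch, assuming the module path used by the Beam SDK (apache_beam.transforms.trigger):

from apache_beam.transforms.core import Windowing
from apache_beam.transforms.trigger import create_trigger_driver
from apache_beam.transforms.window import GlobalWindows

# Default global windowing in batch mode takes the pass-through branch;
# anything with a non-trivial trigger falls back to GeneralTriggerDriver.
driver = create_trigger_driver(Windowing(GlobalWindows()), is_batch=True)
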
Example #4
  def test_buffering_timer_in_global_window_streaming(self):
    max_buffering_duration_secs = 42

    start_time = timestamp.Timestamp(0)
    test_stream = TestStream().advance_watermark_to(start_time)
    for i, value in enumerate(GroupIntoBatchesTest._create_test_data()):
      test_stream.add_elements(
          [TimestampedValue(value, start_time + i)]) \
        .advance_processing_time(5)
    test_stream.advance_watermark_to(
        start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1) \
      .advance_watermark_to_infinity()

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
      # Set a batch size larger than the total number of elements.
      # Since we're in a global window, without the max buffering duration
      # we would be waiting for all the elements to arrive before flushing.
      batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

      # To trigger the processing time timer, use a fake clock whose start time
      # is Timestamp(0). Since the fake clock never advances during pipeline
      # execution, the timer is always set to the same value, so it fires on
      # every element after the first firing.
      fake_clock = FakeClock(now=start_time)

      num_elements_per_batch = (
          pipeline | test_stream
          | WindowInto(
              GlobalWindows(),
              trigger=Repeatedly(AfterCount(1)),
              accumulation_mode=trigger.AccumulationMode.DISCARDING)
          | util.GroupIntoBatches(
              batch_size, max_buffering_duration_secs, fake_clock)
          | 'count elements in batch' >> Map(lambda x: (None, len(x[1])))
          | GroupByKey()
          | FlatMapTuple(lambda k, vs: vs))

      # We will flush twice: once when the max buffering duration is reached
      # and once when the global window ends.
      assert_that(num_elements_per_batch, equal_to([9, 1]))
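
FakeClock is a Beam test helper that is not shown in these snippets; GroupIntoBatches accepts any zero-argument callable as its clock, so a frozen clock is enough to make the buffering timer deterministic. A sketch of what such a helper could look like (an assumption, not Beam's actual implementation):

class FakeClock(object):
    """Hypothetical frozen clock; GroupIntoBatches calls it as clock()."""
    def __init__(self, now):
        self._now = now

    def __call__(self):
        # Always report the same time, so the buffering timer target never moves.
        return self._now
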
Example #5
def create_trigger_driver(windowing,
                          is_batch=False,
                          phased_combine_fn=None,
                          clock=None):
    """Create the TriggerDriver for the given windowing and options."""

    # TODO(robertwb): We can do more if we know elements are in timestamp
    # sorted order.
    if windowing.is_default() and is_batch:
        driver = DiscardingGlobalTriggerDriver()
    elif (windowing.windowfn == GlobalWindows()
          and windowing.triggerfn == AfterCount(1)
          and windowing.accumulation_mode == AccumulationMode.DISCARDING):
        # Here we also just pass through all the values every time.
        driver = DiscardingGlobalTriggerDriver()
    else:
        driver = GeneralTriggerDriver(windowing, clock)

    if phased_combine_fn:
        # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
        # the known phased_combine_fn here.
        driver = CombiningTriggerDriver(phased_combine_fn, driver)
    return driver
Example #6
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info(
         'key: %s, value count: %s', kv[0], len(kv[1]))))

    run = p.run()
    run.wait_until_finish()
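
make_large_elements is not defined in the snippet; a hypothetical stand-in that matches the '128 KiB' comment (the payload size and behavior are assumptions):

def make_large_elements(element):
    # Hypothetical: replace each input with a ~128 KiB payload so the
    # downstream GroupByKey has sizeable batches to buffer.
    yield b'x' * (128 * 1024)
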
Example #7
    def test_combining_with_accumulation_mode_and_fanout(self):
        # PCollection will contain elements from 1 to 5.
        elements = [i for i in range(1, 6)]

        ts = TestStream().advance_watermark_to(0)
        for i in elements:
            ts.add_elements([i])
        ts.advance_watermark_to_infinity()

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            result = (
                p
                | ts
                | beam.WindowInto(
                    GlobalWindows(),
                    accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
                    trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
                | beam.CombineGlobally(sum).without_defaults().with_fanout(2))

            # The firings for DISCARDING mode would be [1, 2, 3, 4, 5, 0, 0].
            firings = [1, 3, 6, 10, 15, 15, 15]
            assert_that(result, equal_to(firings))
Example #8
 def get_windowing(self, inputs):
     return Windowing(GlobalWindows())
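
get_windowing is the PTransform hook that declares the windowing of a transform's output; overriding it as above pins the output to the global window. A minimal sketch of a custom transform doing both the re-windowing and the declaration:

import apache_beam as beam
from apache_beam.transforms.core import Windowing
from apache_beam.transforms.window import GlobalWindows


class IntoGlobalWindow(beam.PTransform):
    """Sketch: a transform whose output is always globally windowed."""
    def expand(self, pcoll):
        return pcoll | beam.WindowInto(GlobalWindows())

    def get_windowing(self, inputs):
        # Declare the output windowing instead of inheriting the inputs'.
        return Windowing(GlobalWindows())
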
Example #9
  def process(self, source):
    if isinstance(source, iobase.SourceBundle):
      for value in source.source.read(source.source.get_range_tracker(
          source.start_position, source.stop_position)):
        yield value
    else:
      # Dataflow native source
      with source.reader() as reader:
        for value in reader:
          yield value


# See DataflowRunner._pardo_fn_data
OLDE_SOURCE_SPLITTABLE_DOFN_DATA = pickler.dumps(
    (OldeSourceSplittableDoFn(), (), {}, [],
     beam.transforms.core.Windowing(GlobalWindows())))


class _GroupingBuffer(object):
  """Used to accumulate groupded (shuffled) results."""
  def __init__(self, pre_grouped_coder, post_grouped_coder, windowing):
    self._key_coder = pre_grouped_coder.key_coder()
    self._pre_grouped_coder = pre_grouped_coder
    self._post_grouped_coder = post_grouped_coder
    self._table = collections.defaultdict(list)
    self._windowing = windowing

  def append(self, elements_data):
    input_stream = create_InputStream(elements_data)
    while input_stream.size() > 0:
      windowed_key_value = self._pre_grouped_coder.get_impl(
Example #10
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """
        Main execution logic for the Sequencer component

        :param input_dict: input channels
        :param output_dict: output channels
        :param exec_properties: the execution properties defined in the spec
        """

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        c = source_utils.load_source_path_class(source)

        # Get the schema
        schema_path = io_utils.get_only_uri_in_dir(
            artifact_utils.get_single_uri(input_dict[constants.SCHEMA]))
        schema = io_utils.SchemaReader().read(schema_path)

        # TODO: Getting the statistics might help the future implementations

        sequence_step: BaseSequencerStep = c(schema=schema,
                                             statistics=None,
                                             **args)

        # Get split names
        input_artifact = artifact_utils.get_single_instance(
            input_dict[constants.INPUT_EXAMPLES])
        split_names = artifact_utils.decode_split_names(
            input_artifact.split_names)

        # Create output artifact
        output_artifact = artifact_utils.get_single_instance(
            output_dict[constants.OUTPUT_EXAMPLES])
        output_artifact.split_names = artifact_utils.encode_split_names(
            split_names)

        with self._make_beam_pipeline() as p:
            for s in split_names:
                input_uri = io_utils.all_files_pattern(
                    artifact_utils.get_split_uri(
                        input_dict[constants.INPUT_EXAMPLES], s))

                output_uri = artifact_utils.get_split_uri(
                    output_dict[constants.OUTPUT_EXAMPLES], s)
                output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

                # Read and decode the data
                data = (
                    p
                    | 'Read_' + s >> beam.io.ReadFromTFRecord(
                        file_pattern=input_uri)
                    | 'Decode_' + s >> tf_example_decoder.DecodeTFExample()
                    | 'ToDataFrame_' + s >> beam.ParDo(
                        utils.ConvertToDataframe()))

                # Window into sessions
                s_data = (
                    data
                    | 'AddCategory_' + s >> beam.ParDo(
                        sequence_step.get_category_do_fn())
                    | 'AddTimestamp_' + s >> beam.ParDo(
                        sequence_step.get_timestamp_do_fn())
                    | 'Sessions_' + s >> beam.WindowInto(
                        sequence_step.get_window()))

                # Combine and transform
                p_data = (
                    s_data
                    | 'Combine_' + s >> beam.CombinePerKey(
                        sequence_step.get_combine_fn()))

                # Write the results
                _ = (
                    p_data
                    | 'Global_' + s >> beam.WindowInto(GlobalWindows())
                    | 'RemoveKey_' + s >> beam.ParDo(RemoveKey())
                    | 'ToExample_' + s >> beam.Map(utils.df_to_example)
                    | 'Serialize_' + s >> beam.Map(utils.serialize)
                    | 'Write_' + s >> beam.io.WriteToTFRecord(
                        output_path,
                        file_name_suffix='.gz'))
Example #11
 def finish_bundle(self):
     yield WindowedValue(list(self.all_columns),
                         timestamp=0,
                         windows=[GlobalWindows()])
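
One detail worth noting for this pattern: the objects placed in a WindowedValue's windows list are window instances. In the Beam SDK the single global window instance is GlobalWindow(), while GlobalWindows() is the WindowFn handed to WindowInto. A minimal sketch of a buffering DoFn using the window instance (the DoFn itself is hypothetical):

import apache_beam as beam
from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.timestamp import MIN_TIMESTAMP
from apache_beam.utils.windowed_value import WindowedValue


class EmitAllOnFinish(beam.DoFn):
    """Hypothetical DoFn: buffer a bundle's elements, emit them at the end."""
    def start_bundle(self):
        self._buffer = []

    def process(self, element):
        self._buffer.append(element)
        return []  # nothing is emitted per element

    def finish_bundle(self):
        # finish_bundle must yield WindowedValue objects explicitly.
        yield WindowedValue(
            list(self._buffer), MIN_TIMESTAMP, [GlobalWindow()])
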
Example #12
 def _test(self, trigger, lateness, expected):
     windowing = WindowInto(GlobalWindows(),
                            trigger=trigger,
                            accumulation_mode=AccumulationMode.ACCUMULATING,
                            allowed_lateness=lateness).windowing
     self.assertEqual(trigger.may_lose_data(windowing), expected)
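
may_lose_data reports whether a trigger/windowing combination can drop data. A hedged usage sketch for the helper above, assuming the DataLossReason flags that accompany Trigger.may_lose_data in apache_beam.transforms.trigger; the expected values are assumptions based on the triggers' documented semantics, not verified output:

import unittest

from apache_beam.transforms.core import WindowInto
from apache_beam.transforms.trigger import (
    AccumulationMode, AfterCount, DataLossReason, DefaultTrigger)
from apache_beam.transforms.window import GlobalWindows


class MayLoseDataSketch(unittest.TestCase):
    # Same helper as in the snippet above.
    def _test(self, trigger, lateness, expected):
        windowing = WindowInto(GlobalWindows(),
                               trigger=trigger,
                               accumulation_mode=AccumulationMode.ACCUMULATING,
                               allowed_lateness=lateness).windowing
        self.assertEqual(trigger.may_lose_data(windowing), expected)

    def test_default_trigger(self):
        # The default trigger never finishes, so no data should be droppable.
        self._test(DefaultTrigger(), 0, DataLossReason.NO_POTENTIAL_LOSS)

    def test_after_count_may_finish(self):
        # A bare AfterCount(1) may finish the window and drop later elements.
        self._test(AfterCount(1), 0, DataLossReason.MAY_FINISH)
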