def test_reshuffle_streaming_global_window(self):
    """Check that Reshuffle leaves grouped output unchanged in streaming mode.

    The same expected grouping is asserted on the PCollection both before
    and after it passes through ``beam.Reshuffle()``.
    """
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=streaming_options) as pipeline:
        windowed = (
            pipeline
            | beam.Create(data)
            | beam.WindowInto(GlobalWindows()))
        grouped = windowed | beam.GroupByKey()
        # Sort each value list so the comparison does not depend on the
        # order in which values were grouped.
        before_reshuffle = grouped | beam.MapTuple(
            lambda key, values: (key, sorted(values)))
        assert_that(
            before_reshuffle,
            equal_to(expected_data),
            label='before_reshuffle')

        after_reshuffle = before_reshuffle | beam.Reshuffle()
        assert_that(
            after_reshuffle,
            equal_to(expected_data),
            label='after reshuffle')
def test_buffering_timer_in_fixed_window_streaming(self):
    """Check GroupIntoBatches batch sizes under fixed windows with a timer.

    Elements are stamped one second apart starting at Timestamp(0), then
    processing time is advanced by 150 and the watermark is moved past the
    first window before going to infinity.
    """
    window_duration = 6
    max_buffering_duration_secs = 100
    start_time = timestamp.Timestamp(0)
    first_window_end = start_time + window_duration

    test_stream = TestStream()
    test_stream = test_stream.add_elements([
        TimestampedValue(value, start_time + offset)
        for offset, value in enumerate(
            GroupIntoBatchesTest._create_test_data())
    ])
    test_stream = test_stream.advance_processing_time(150)
    test_stream = test_stream.advance_watermark_to(first_window_end)
    test_stream = test_stream.advance_watermark_to(first_window_end + 1)
    test_stream = test_stream.advance_watermark_to_infinity()

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
        # To trigger the processing time timer, use a fake clock with start
        # time being Timestamp(0).
        fake_clock = FakeClock(now=start_time)

        batches = (
            pipeline
            | test_stream
            | "fixed window" >> WindowInto(FixedWindows(window_duration))
            | util.GroupIntoBatches(
                GroupIntoBatchesTest.BATCH_SIZE,
                max_buffering_duration_secs,
                fake_clock))
        # Re-key every batch size under None in the global window so all of
        # them can be gathered and compared in a single assertion.
        num_elements_per_batch = (
            batches
            | "count elements in batch" >> Map(
                lambda batch: (None, len(batch[1])))
            | "global window" >> WindowInto(GlobalWindows())
            | GroupByKey()
            | FlatMapTuple(lambda k, vs: vs))

        expected_batch_sizes = [
            # Window duration is 6 and batch size is 5, so output batch size
            # should be 5 (flush because of batch size reached).
            5,
            # There is only one element left in the window so batch size
            # should be 1 (flush because of max buffering duration reached).
            1,
            # Collection has 10 elements, there are only 4 left, so batch
            # size should be 4 (flush because of end of window reached).
            4,
        ]
        assert_that(
            num_elements_per_batch,
            equal_to(expected_batch_sizes),
            "assert2")
def create_trigger_driver(
        windowing, is_batch=False, phased_combine_fn=None, clock=None):
    """Create the TriggerDriver for the given windowing and options.

    A BatchGlobalTriggerDriver is chosen in batch mode when the windowing is
    the default one, or when it uses global windows with an AfterCount(1) or
    Always() trigger. Every other case gets a GeneralTriggerDriver. If
    ``phased_combine_fn`` is truthy the chosen driver is wrapped in a
    CombiningTriggerDriver.
    """
    # TODO(robertwb): We can do more if we know elements are in timestamp
    # sorted order.
    if windowing.is_default() and is_batch:
        passes_values_through_once = True
    else:
        # Here we also just pass through all the values exactly once.
        passes_values_through_once = (
            windowing.windowfn == GlobalWindows()
            and windowing.triggerfn in (AfterCount(1), Always())
            and is_batch)

    if passes_values_through_once:
        base_driver = BatchGlobalTriggerDriver()
    else:
        base_driver = GeneralTriggerDriver(windowing, clock)

    if not phased_combine_fn:
        return base_driver
    # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
    # the known phased_combine_fn here.
    return CombiningTriggerDriver(phased_combine_fn, base_driver)
def test_buffering_timer_in_global_window_streaming(self):
    """Check GroupIntoBatches flushes on the buffering timer in a global window.

    Each element is added to the stream on its own, followed by a
    processing-time advance of 5, so the buffering limit (42) is hit before
    all elements have arrived.
    """
    max_buffering_duration_secs = 42
    start_time = timestamp.Timestamp(0)

    test_stream = TestStream().advance_watermark_to(start_time)
    for offset, value in enumerate(GroupIntoBatchesTest._create_test_data()):
        (test_stream
         .add_elements([TimestampedValue(value, start_time + offset)])
         .advance_processing_time(5))
    (test_stream
     .advance_watermark_to(
         start_time + GroupIntoBatchesTest.NUM_ELEMENTS + 1)
     .advance_watermark_to_infinity())

    with TestPipeline(options=StandardOptions(streaming=True)) as pipeline:
        # Set a batch size larger than the total number of elements.
        # Since we're in a global window, we would have been waiting
        # for all the elements to arrive without the buffering time limit.
        batch_size = GroupIntoBatchesTest.NUM_ELEMENTS * 2

        # To trigger the processing time timer, use a fake clock with start
        # time being Timestamp(0). Since the fake clock never really advances
        # during the pipeline execution, meaning that the timer is always set
        # to the same value, the timer will be fired on every element after
        # the first firing.
        fake_clock = FakeClock(now=start_time)

        global_windowing = WindowInto(
            GlobalWindows(),
            trigger=Repeatedly(AfterCount(1)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        batches = (
            pipeline
            | test_stream
            | global_windowing
            | util.GroupIntoBatches(
                batch_size, max_buffering_duration_secs, fake_clock))
        num_elements_per_batch = (
            batches
            | 'count elements in batch' >> Map(
                lambda batch: (None, len(batch[1])))
            | GroupByKey()
            | FlatMapTuple(lambda k, vs: vs))

        # We will flush twice when the max buffering duration is reached and
        # when the global window ends.
        assert_that(num_elements_per_batch, equal_to([9, 1]))
def create_trigger_driver(windowing,
                          is_batch=False,
                          phased_combine_fn=None,
                          clock=None):
    """Create the TriggerDriver for the given windowing and options.

    A DiscardingGlobalTriggerDriver is chosen when the windowing is the
    default one in batch mode, or when it uses global windows with an
    AfterCount(1) trigger in DISCARDING accumulation mode. Every other case
    gets a GeneralTriggerDriver. If ``phased_combine_fn`` is truthy the
    chosen driver is wrapped in a CombiningTriggerDriver.
    """
    # TODO(robertwb): We can do more if we know elements are in timestamp
    # sorted order.
    if windowing.is_default() and is_batch:
        passes_values_through = True
    else:
        # Here we also just pass through all the values every time.
        passes_values_through = (
            windowing.windowfn == GlobalWindows()
            and windowing.triggerfn == AfterCount(1)
            and windowing.accumulation_mode == AccumulationMode.DISCARDING)

    if passes_values_through:
        base_driver = DiscardingGlobalTriggerDriver()
    else:
        base_driver = GeneralTriggerDriver(windowing, clock)

    if not phased_combine_fn:
        return base_driver
    # TODO(ccy): Refactor GeneralTriggerDriver to combine values eagerly using
    # the known phased_combine_fn here.
    return CombiningTriggerDriver(phased_combine_fn, base_driver)
def main():
    """Build and run a pipeline that groups large elements under one key.

    All elements share the empty-string key and are grouped in the global
    window, with a repeated trigger that fires on either an element count or
    a processing-time delay. The size of each fired group is logged.
    """
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    batch_size = 1000000
    buffering_secs = 600

    p = Pipeline(options=options)

    # Fire whenever either limit is reached, then start over.
    flush_trigger = Repeatedly(
        AfterAny(AfterCount(batch_size), AfterProcessingTime(buffering_secs)))

    keyed_elements = (
        p
        | Create(range(100), reshuffle=True)
        | ParDo(make_large_elements)  # 128 KiB
        | WithKeys(''))
    grouped = (
        keyed_elements
        | WindowInto(
            GlobalWindows(),
            trigger=flush_trigger,
            accumulation_mode=AccumulationMode.DISCARDING)
        | GroupByKey())
    grouped | Map(
        lambda kv: logging.info(
            'key: %s, value count: %s', kv[0], len(kv[1])))

    p.run().wait_until_finish()
def test_combining_with_accumulation_mode_and_fanout(self):
    """Check CombineGlobally with fanout under ACCUMULATING early firings."""
    # PCollection will contain elements from 1 to 5.
    elements = list(range(1, 6))

    ts = TestStream().advance_watermark_to(0)
    for element in elements:
        ts.add_elements([element])
    ts.advance_watermark_to_infinity()

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True

    with TestPipeline(options=options) as p:
        accumulating_window = beam.WindowInto(
            GlobalWindows(),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING,
            trigger=AfterWatermark(early=AfterAll(AfterCount(1))))
        summed = beam.CombineGlobally(sum).without_defaults().with_fanout(2)
        result = p | ts | accumulating_window | summed

        # The firings for DISCARDING mode is [1, 2, 3, 4, 5, 0, 0]; in
        # ACCUMULATING mode the expected firings are listed below.
        firings = [1, 3, 6, 10, 15, 15, 15]
        assert_that(result, equal_to(firings))
def get_windowing(self, inputs):
    """Return a Windowing built from GlobalWindows.

    ``inputs`` is accepted but not consulted.
    """
    global_windowing = Windowing(GlobalWindows())
    return global_windowing
def process(self, source): if isinstance(source, iobase.SourceBundle): for value in source.source.read(source.source.get_range_tracker( source.start_position, source.stop_position)): yield value else: # Dataflow native source with source.reader() as reader: for value in reader: yield value # See DataflowRunner._pardo_fn_data OLDE_SOURCE_SPLITTABLE_DOFN_DATA = pickler.dumps( (OldeSourceSplittableDoFn(), (), {}, [], beam.transforms.core.Windowing(GlobalWindows()))) class _GroupingBuffer(object): """Used to accumulate groupded (shuffled) results.""" def __init__(self, pre_grouped_coder, post_grouped_coder, windowing): self._key_coder = pre_grouped_coder.key_coder() self._pre_grouped_coder = pre_grouped_coder self._post_grouped_coder = post_grouped_coder self._table = collections.defaultdict(list) self._windowing = windowing def append(self, elements_data): input_stream = create_InputStream(elements_data) while input_stream.size() > 0: windowed_key_value = self._pre_grouped_coder.get_impl(
def Do(self,
       input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """
    Main execution logic for the Sequencer component.

    For every split of the input examples, the TFRecords are read and
    decoded, windowed by the configured sequencer step, combined per key,
    re-windowed globally and written back as gzipped TFRecords.

    :param input_dict: input channels
    :param output_dict: output channels
    :param exec_properties: the execution properties defined in the spec
    """
    step_source = exec_properties[StepKeys.SOURCE]
    step_args = exec_properties[StepKeys.ARGS]
    step_class = source_utils.load_source_path_class(step_source)

    # Get the schema
    schema_dir = artifact_utils.get_single_uri(input_dict[constants.SCHEMA])
    schema_path = io_utils.get_only_uri_in_dir(schema_dir)
    schema = io_utils.SchemaReader().read(schema_path)

    # TODO: Getting the statistics might help the future implementations
    sequence_step: BaseSequencerStep = step_class(
        schema=schema, statistics=None, **step_args)

    # Get split names
    examples_in = input_dict[constants.INPUT_EXAMPLES]
    input_artifact = artifact_utils.get_single_instance(examples_in)
    split_names = artifact_utils.decode_split_names(
        input_artifact.split_names)

    # Create output artifact; it carries the same split names as the input
    examples_out = output_dict[constants.OUTPUT_EXAMPLES]
    output_artifact = artifact_utils.get_single_instance(examples_out)
    output_artifact.split_names = artifact_utils.encode_split_names(
        split_names)

    with self._make_beam_pipeline() as p:
        for split in split_names:
            split_input_uri = artifact_utils.get_split_uri(examples_in, split)
            input_uri = io_utils.all_files_pattern(split_input_uri)

            output_uri = artifact_utils.get_split_uri(examples_out, split)
            output_path = os.path.join(output_uri, self._DEFAULT_FILENAME)

            # Read and decode the data, then window it into sessions.
            # Every label is suffixed with the split name.
            windowed = (
                p
                | 'Read_' + split >> beam.io.ReadFromTFRecord(
                    file_pattern=input_uri)
                | 'Decode_' + split >> tf_example_decoder.DecodeTFExample()
                | 'ToDataFrame_' + split >> beam.ParDo(
                    utils.ConvertToDataframe())
                | 'AddCategory_' + split >> beam.ParDo(
                    sequence_step.get_category_do_fn())
                | 'AddTimestamp_' + split >> beam.ParDo(
                    sequence_step.get_timestamp_do_fn())
                | 'Sessions_' + split >> beam.WindowInto(
                    sequence_step.get_window()))

            # Combine and transform
            combined = windowed | 'Combine_' + split >> beam.CombinePerKey(
                sequence_step.get_combine_fn())

            # Write the results
            _ = (
                combined
                | 'Global_' + split >> beam.WindowInto(GlobalWindows())
                | 'RemoveKey_' + split >> beam.ParDo(RemoveKey())
                | 'ToExample_' + split >> beam.Map(utils.df_to_example)
                | 'Serialize_' + split >> beam.Map(utils.serialize)
                | 'Write_' + split >> beam.io.WriteToTFRecord(
                    output_path, file_name_suffix='.gz'))
def finish_bundle(self):
    """Emit the columns collected on this instance as a single element.

    Yields one WindowedValue whose value is a list copy of
    ``self.all_columns``, stamped with timestamp 0 and assigned to the
    global window.
    """
    # Imported locally so this method does not depend on the module's import
    # block. ``GlobalWindow`` is the window object; ``GlobalWindows`` is the
    # WindowFn that assigns elements to it and is not itself a window, so it
    # must not be placed in ``windows``.
    from apache_beam.transforms.window import GlobalWindow

    yield WindowedValue(
        list(self.all_columns),
        timestamp=0,
        windows=[GlobalWindow()])
def _test(self, trigger, lateness, expected):
    """Assert that ``trigger.may_lose_data`` returns ``expected``.

    The trigger is evaluated against the windowing of a global-window
    WindowInto in ACCUMULATING mode with ``lateness`` as allowed lateness.
    """
    window_transform = WindowInto(
        GlobalWindows(),
        trigger=trigger,
        accumulation_mode=AccumulationMode.ACCUMULATING,
        allowed_lateness=lateness)
    may_lose = trigger.may_lose_data(window_transform.windowing)
    self.assertEqual(may_lose, expected)