def expand(self, pcoll):
    """Window, timestamp, shard-key and group the incoming elements.

    Reads ``self.window_size`` (handed to ``FixedWindows``) and
    ``self.num_shards`` (each element gets a random integer key in
    ``[0, self.num_shards - 1]``).

    Args:
        pcoll: The input collection the transform is applied to.

    Returns:
        The output of ``GroupByKey`` applied to the keyed, windowed elements.
    """
    # Attach window information to every element, based on the element
    # timestamp (or publish time).
    windowed = pcoll | "Window into fixed intervals" >> WindowInto(
        FixedWindows(self.window_size))

    stamped = windowed | "Add timestamp to windowed elements" >> ParDo(
        AddTimestamp())

    # Spread the windowed elements over shards by giving each one a random key.
    keyed = stamped | "Add key" >> WithKeys(
        lambda _element: random.randint(0, self.num_shards - 1))

    # Collect the elements per key. Everything that lands in the same window
    # has to fit in memory here; when it does not, batch the elements instead
    # (the original note points at `beam.util.BatchElements`).
    return keyed | "Group by key" >> GroupByKey()
def main():
    """Build and launch the pipeline that feeds ``BigBagDoFn``.

    The pipeline creates ``NUM_SHARDS`` seed values, expands each seed into
    ``NUM_ELEMENTS_PER_SHARD`` ``bytes(ELEMENT_BYTES)`` payloads, puts every
    payload under the same empty-string key and hands the keyed stream to
    ``ParDo(BigBagDoFn())``.

    The pipeline is started with ``run()``; this function does not wait for
    the run to finish.
    """
    pipeline_options = PipelineOptions()
    # Pickle the main session so module-level names are available to workers.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    seeds = pipeline | Create(list(range(NUM_SHARDS)))
    # The seed value itself is ignored; it only fans out into payloads.
    payloads = seeds | FlatMap(
        lambda _seed: (
            bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
    # A single constant key routes every payload to the same key.
    keyed = payloads | WithKeys('')
    _ = keyed | ParDo(BigBagDoFn())

    pipeline.run()
def main():
    """Run the batching pipeline built on ``GroupIntoBatchesWithMultiBags``.

    One hundred seed values are created (with reshuffle), passed through
    ``ParDo(make_large_elements)``, keyed with the empty string, batched, and
    finally each batch's key and value count is logged at INFO level.

    Blocks until the pipeline run has finished.
    """
    pipeline_options = PipelineOptions()
    # Pickle the main session so module-level names are available to workers.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    batch_size = 1000000
    buffering_secs = 600  # i.e. 10 minutes

    pipeline = Pipeline(options=pipeline_options)

    seeds = pipeline | Create(range(100), reshuffle=True)
    # NOTE(review): the original comment says these elements are 128 KiB;
    # confirm against make_large_elements.
    large_elements = seeds | ParDo(make_large_elements)
    # A single constant key routes every element to the same key.
    keyed = large_elements | WithKeys('')
    # Large batch size combined with the 600-second buffering limit.
    batches = keyed | GroupIntoBatchesWithMultiBags(batch_size, buffering_secs)
    _ = batches | Map(
        lambda pair: logging.info(
            'key: %s, value count: %s', pair[0], len(pair[1])))

    result = pipeline.run()
    result.wait_until_finish()
def main():
    """Run the batching pipeline built on a global window plus a trigger.

    One hundred seed values are created (with reshuffle), passed through
    ``ParDo(make_large_elements)``, keyed with the empty string, and placed
    in the global window with a repeating trigger that uses ``AfterAny`` over
    an element count of 1000000 and a processing time of 600 seconds. Panes
    are discarded after firing. The grouped output's key and value count are
    logged at INFO level.

    Blocks until the pipeline run has finished.
    """
    pipeline_options = PipelineOptions()
    # Pickle the main session so module-level names are available to workers.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    batch_size = 1000000
    buffering_secs = 600  # i.e. 10 minutes

    # Fire repeatedly on whichever comes first: the count or the time limit.
    flush_trigger = Repeatedly(
        AfterAny(AfterCount(batch_size), AfterProcessingTime(buffering_secs)))

    pipeline = Pipeline(options=pipeline_options)

    seeds = pipeline | Create(range(100), reshuffle=True)
    # NOTE(review): the original comment says these elements are 128 KiB;
    # confirm against make_large_elements.
    large_elements = seeds | ParDo(make_large_elements)
    # A single constant key routes every element to the same key.
    keyed = large_elements | WithKeys('')
    windowed = keyed | WindowInto(
        GlobalWindows(),
        trigger=flush_trigger,
        accumulation_mode=AccumulationMode.DISCARDING)
    grouped = windowed | GroupByKey()
    _ = grouped | Map(
        lambda pair: logging.info(
            'key: %s, value count: %s', pair[0], len(pair[1])))

    result = pipeline.run()
    result.wait_until_finish()