Example #1
def expand(self, pcoll):
    return (
        pcoll
        # Bind window info to each element using the element timestamp (or publish time).
        | "Window into fixed intervals" >> WindowInto(
            FixedWindows(self.window_size))
        | "Add timestamp to windowed elements" >> ParDo(AddTimestamp())
        # Assign a random key to each windowed element based on the number of shards.
        | "Add key" >> WithKeys(lambda _: random.randint(0, self.num_shards - 1))
        # Group windowed elements by key. All elements in the same window must fit in
        # memory for this; if they do not, use `beam.transforms.util.BatchElements` instead.
        | "Group by key" >> GroupByKey())
Example #2
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)
    (p
     | Create(list(range(NUM_SHARDS)))
     | FlatMap(lambda _:
               (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
     | WithKeys('')
     | ParDo(BigBagDoFn()))

    p.run()
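
NUM_SHARDS, ELEMENT_BYTES, NUM_ELEMENTS_PER_SHARD, and BigBagDoFn are not defined in the snippet. A minimal sketch, assuming BigBagDoFn accumulates every value into per-key bag state (the WithKeys('') step supplies the single key a stateful DoFn requires); the constant values are illustrative only:

import apache_beam as beam
from apache_beam import Create, FlatMap, ParDo, Pipeline, WithKeys
from apache_beam.coders import BytesCoder
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
from apache_beam.transforms.userstate import BagStateSpec

NUM_SHARDS = 10                  # illustrative values, not from the snippet
ELEMENT_BYTES = 1 << 20          # 1 MiB of zero bytes per element
NUM_ELEMENTS_PER_SHARD = 1000


class BigBagDoFn(beam.DoFn):
    # Adds every incoming payload to bag state, deliberately growing
    # one very large bag per key.
    BIG_BAG = BagStateSpec('big_bag', BytesCoder())

    def process(self, element, bag=beam.DoFn.StateParam(BIG_BAG)):
        _, payload = element  # input is keyed: (key, payload)
        bag.add(payload)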
Example #3
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)  # big batch size with a 10-minute buffering trigger
   | Map(lambda kv: logging.info('key: %s, value count: %s',
                                 kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
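
make_large_elements and GroupIntoBatchesWithMultiBags are not defined here; the latter is presumably a custom variant of GroupIntoBatches that spreads the buffer across several bag states so no single bag grows too large. A stand-in sketch using Beam's built-in beam.GroupIntoBatches, which in recent releases also accepts a buffering timeout:

import apache_beam as beam


def make_large_elements(i):
    # Hypothetical: emit one ~128 KiB payload per input element.
    yield b'\0' * (128 << 10)


def GroupIntoBatchesWithMultiBags(batch_size, buffering_secs):
    # Stand-in, not the original transform: flushes a batch when it
    # reaches batch_size or when buffering_secs of processing time
    # has elapsed.
    return beam.GroupIntoBatches(
        batch_size, max_buffering_duration_secs=buffering_secs)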
Example #4
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    p = Pipeline(options=options)
    (p
     | Create(range(100), reshuffle=True)
     | ParDo(make_large_elements)  # 128 KiB
     | WithKeys('')
     | WindowInto(GlobalWindows(),
                  trigger=Repeatedly(
                      AfterAny(AfterCount(BATCH_SIZE),
                               AfterProcessingTime(BUFFERING_SECS))),
                  accumulation_mode=AccumulationMode.DISCARDING)
     | GroupByKey()
     | Map(lambda kv: logging.info('key: %s, value count: %s',
                                   kv[0], len(kv[1]))))

    run = p.run()
    run.wait_until_finish()
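
The windowing and trigger names used above come from Beam's window and trigger modules; a sketch of the imports this example needs, with make_large_elements as sketched for example #3:

import logging

from apache_beam import (
    Create, GroupByKey, Map, ParDo, Pipeline, WindowInto, WithKeys)
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions
from apache_beam.transforms.trigger import (
    AccumulationMode, AfterAny, AfterCount, AfterProcessingTime, Repeatedly)
from apache_beam.transforms.window import GlobalWindows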