Example #1
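The expand method of a write composite: a one-element 'DoOnce' collection triggers sink.initialize_write() exactly once, the input is written in bundles (re-keyed round-robin when the sink asks for a fixed number of shards), and the bundle results feed the final 'FinalizeWrite' step as side inputs. Module-level names such as core, window, AsSingleton, AsIter and the _Write* DoFns are assumed to be imported elsewhere in the file.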
 def expand(self, pcoll):
     do_once = pcoll.pipeline | 'DoOnce' >> core.Create([None])
     init_result_coll = do_once | 'InitializeWrite' >> core.Map(
         lambda _, sink: sink.initialize_write(), self.sink)
     if getattr(self.sink, 'num_shards', 0):
         min_shards = self.sink.num_shards
         if min_shards == 1:
             keyed_pcoll = pcoll | core.Map(lambda x: (None, x))
         else:
             keyed_pcoll = pcoll | core.ParDo(_RoundRobinKeyFn(min_shards))
         write_result_coll = (
             keyed_pcoll
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey()
              | 'WriteBundles' >> core.ParDo(_WriteKeyedBundleDoFn(self.sink),
                                             AsSingleton(init_result_coll)))
     else:
         min_shards = 1
         write_result_coll = (
             pcoll
             | 'WriteBundles' >> core.ParDo(_WriteBundleDoFn(self.sink),
                                            AsSingleton(init_result_coll))
             | 'Pair' >> core.Map(lambda x: (None, x))
             | core.WindowInto(window.GlobalWindows())
             | core.GroupByKey()
             | 'Extract' >> core.FlatMap(lambda x: x[1]))
     return do_once | 'FinalizeWrite' >> core.FlatMap(
         _finalize_write, self.sink, AsSingleton(init_result_coll),
         AsIter(write_result_coll), min_shards)
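The _RoundRobinKeyFn helper used above is not part of this listing. Below is a minimal sketch of what such a keying DoFn could look like (an assumption, not the original implementation): it cycles each element through count keys so the following GroupByKey produces at most count keyed bundles.

import apache_beam as beam

# Sketch of the keying DoFn assumed by Example #1; not the original _RoundRobinKeyFn.
class RoundRobinKeyFn(beam.DoFn):
    """Assigns each element a key that cycles through range(count)."""

    def __init__(self, count):
        self._count = count

    def start_bundle(self):
        # Restart the cycle at the beginning of every bundle.
        self._counter = 0

    def process(self, element):
        self._counter = (self._counter + 1) % self._count
        yield self._counter, element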
Example #2
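The read-side counterpart: when the beam_fn_api experiment is enabled, the source is split into chunks whose size follows a square-root heuristic, the splits are redistributed with Reshuffle, and each split is read through its own range tracker; otherwise Read is left for the runner to handle as a primitive. (math and pvalue are assumed to be imported at module scope.)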
  def expand(self, pbegin):
    from apache_beam.options.pipeline_options import DebugOptions
    from apache_beam.transforms import util

    assert isinstance(pbegin, pvalue.PBegin)
    self.pipeline = pbegin.pipeline

    debug_options = self.pipeline._options.view_as(DebugOptions)
    if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
      source = self.source

      def split_source(unused_impulse):
        total_size = source.estimate_size()
        if total_size:
          # 1MB = 1 shard, 1GB = 32 shards, 1TB = 1000 shards, 1PB = 32k shards
          chunk_size = max(1 << 20, 1000 * int(math.sqrt(total_size)))
        else:
          chunk_size = 64 << 20  # 64 MB
        return source.split(chunk_size)

      return (
          pbegin
          | core.Impulse()
          | 'Split' >> core.FlatMap(split_source)
          | util.Reshuffle()
          | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
              split.source.get_range_tracker(
                  split.start_position, split.stop_position))))
    else:
      # Treat Read itself as a primitive.
      return pvalue.PCollection(self.pipeline)
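A quick standalone check of the shard-sizing heuristic above (illustrative only, not part of the snippet): chunk_size = max(1 MiB, 1000 * sqrt(total_size)) reproduces the shard counts quoted in the comment.

import math

def desired_chunk_size(total_size):
    # Same heuristic as split_source above: at least 1 MiB per chunk.
    return max(1 << 20, 1000 * int(math.sqrt(total_size)))

for label, size in [('1 MiB', 1 << 20), ('1 GiB', 1 << 30),
                    ('1 TiB', 1 << 40), ('1 PiB', 1 << 50)]:
    print(label, '->', size // desired_chunk_size(size), 'shards (approx.)')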
Example #3
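A trimmed-down variant of the previous read expansion. The names source, util and Read.get_desired_chunk_size are free here and presumably come from the enclosing scope, so the snippet is not runnable on its own.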
 def expand(self, pbegin):
   return (
       pbegin
       | core.Impulse()
       | 'Split' >> core.FlatMap(lambda _: source.split(
           Read.get_desired_chunk_size(source.estimate_size())))
       | util.Reshuffle()
       | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
           split.source.get_range_tracker(
               split.start_position, split.stop_position))))
Example #4
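A wrapper around the Write transform: once the main write has produced its results, a one-off 'SuccessWrite' step invokes self._success_write (defined elsewhere on the class) with those results passed as an iterable side input.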
    def expand(self, pcoll):
        do_once = pcoll.pipeline | 'DoOnceSuccess' >> core.Create([None])
        main_write_result = pcoll | 'MainWrite' >> Write(self.sink)

        return (do_once
                | 'SuccessWrite' >> core.FlatMap(
                    self._success_write, pvalue.AsIter(main_write_result)))
Example #5
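Structurally the same as Example #2, except that a fixed constant (NUM_SPLITS = 1000) is passed to source.split() instead of a chunk size derived from the estimated source size.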
    def expand(self, pbegin):
        from apache_beam.options.pipeline_options import DebugOptions
        from apache_beam.transforms import util

        assert isinstance(pbegin, pvalue.PBegin)
        self.pipeline = pbegin.pipeline

        debug_options = self.pipeline._options.view_as(DebugOptions)
        if debug_options.experiments and 'beam_fn_api' in debug_options.experiments:
            NUM_SPLITS = 1000
            source = self.source
            return (
                pbegin
                | core.Impulse()
                | 'Split' >> core.FlatMap(lambda _: source.split(NUM_SPLITS))
                | util.Reshuffle()
                | 'ReadSplits' >> core.FlatMap(lambda split: split.source.read(
                    split.source.get_range_tracker(
                        split.start_position, split.stop_position))))
        else:
            # Treat Read itself as a primitive.
            return pvalue.PCollection(self.pipeline)