예제 #1
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield one bundle per batch-sized step, followed by a trailing bundle.

        While ``self._counts`` is still ``0`` the record count is fetched from
        the client's ``counts_estimator`` and stored on the instance.

        Args:
            desired_bundle_size: Used only as the ``weight`` of each bundle.
            start_position: First offset to emit; ``None`` means ``0``.
            stop_position: Upper bound for the offsets; ``None`` means the
                estimated record count.

        Yields:
            ``iobase.SourceBundle`` objects whose source is ``self.source``.
        """
        if self._counts == 0:
            self._counts = self.source.client.counts_estimator(
                self.source.query)

        first_offset = 0 if start_position is None else start_position
        upper_bound = self._counts if stop_position is None else stop_position

        # NOTE(review): every batch bundle carries the batch size (not an
        # absolute offset) as its stop_position, and the trailing bundle
        # starts one past the end of the last batch -- presumably the reader
        # interprets these as offset/limit; confirm against the reader.
        trailing_start = 1
        for offset in range(first_offset, upper_bound, self._batch_size):
            yield iobase.SourceBundle(
                weight=desired_bundle_size,
                source=self.source,
                start_position=offset,
                stop_position=self._batch_size,
            )
            trailing_start = offset + self._batch_size + 1

        yield iobase.SourceBundle(weight=desired_bundle_size,
                                  source=self.source,
                                  start_position=trailing_start,
                                  stop_position=upper_bound)
예제 #2
0
    def split_range_subranges(self, sample_size_bytes, desired_bundle_size,
                              ranges):
        """Split one key range into bundles of roughly the desired size.

        The ratio ``desired_bundle_size / sample_size_bytes`` is floored to
        two decimals and used as the fraction of the range each bundle
        covers. Intermediate keys come from ``self.fraction_to_position``.

        The range is emitted as a single bundle when either boundary is the
        empty key ``b''``, when the sample is empty, or when the desired
        bundle size is at least the sample size.

        Args:
            sample_size_bytes: Size of the range, in bytes.
            desired_bundle_size: Target size of each bundle, in bytes.
            ranges: Object exposing ``start_position()`` and
                ``stop_position()`` for the range to split.

        Yields:
            ``iobase.SourceBundle`` objects covering the range in order.
        """
        start_position = ranges.start_position()
        end_position = ranges.stop_position()

        unbounded = start_position == b'' or end_position == b''
        if unbounded or sample_size_bytes <= 0:
            # Guarding the empty sample also avoids a ZeroDivisionError.
            split_ = 1
        else:
            split_ = float(desired_bundle_size) / float(sample_size_bytes)
            split_ = math.floor(split_ * 100) / 100

        if split_ >= 1:
            # A ratio above 1 used to fall through to the splitting branch
            # and produce a single empty (end, end) bundle.
            yield iobase.SourceBundle(sample_size_bytes, self, start_position,
                                      end_position)
        else:
            # Ratios below 1% floor to 0, which made the portion size 0 and
            # the loop below never terminate; clamp to the 1% granularity
            # and to at least one byte per bundle.
            split_ = max(split_, 0.01)
            size_portion = max(int(sample_size_bytes * split_), 1)

            start_key = start_position
            sum_portion = size_portion
            while sum_portion < sample_size_bytes:
                fraction_portion = float(sum_portion) / float(
                    sample_size_bytes)
                position = self.fraction_to_position(fraction_portion,
                                                     start_position,
                                                     end_position)
                yield iobase.SourceBundle(size_portion, self, start_key,
                                          position)
                start_key = position
                sum_portion += size_portion
            # Whatever is left after the last full portion.
            last_size = sample_size_bytes - (sum_portion - size_portion)
            yield iobase.SourceBundle(last_size, self, start_key,
                                      end_position)
예제 #3
0
파일: mongodbio.py 프로젝트: x1-/beam
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield bundles delimited by the keys from ``_get_split_keys``.

        ``None`` positions are first resolved by ``_replace_none_positions``.
        Every bundle is weighted with the desired size converted to whole
        megabytes.

        Args:
            desired_bundle_size: Desired bundle size in bytes.
            start_position: Lower bound of the range, or ``None``.
            stop_position: Upper bound of the range, or ``None``.

        Yields:
            ``iobase.SourceBundle`` objects covering the range in order.
        """
        start_position, stop_position = self._replace_none_positions(
            start_position, stop_position)
        size_in_mb = desired_bundle_size // 1024 // 1024

        lower = start_position
        for key in self._get_split_keys(size_in_mb, start_position,
                                        stop_position):
            if lower >= stop_position:
                break
            upper = min(stop_position, key)
            yield iobase.SourceBundle(
                weight=size_in_mb,
                source=self,
                start_position=lower,
                stop_position=upper,
            )
            lower = upper

        # Cover whatever remains between the last split key and the end.
        if lower < stop_position:
            yield iobase.SourceBundle(
                weight=size_in_mb,
                source=self,
                start_position=lower,
                stop_position=stop_position,
            )
예제 #4
0
 def split(self,
           desired_bundle_size,
           start_position=None,
           stop_position=None):
     """Split the serialized values into sub-sources of similar byte size.

     With fewer than two values a single zero-weight bundle spanning all
     values is produced. Otherwise the average encoded size per value
     decides how many values go into each bundle, and a trailing remainder
     smaller than a quarter of a bundle is merged into the previous one.

     Args:
         desired_bundle_size: Desired bundle size in bytes.
         start_position: Index of the first value; ``None`` means ``0``.
         stop_position: Index past the last value; ``None`` means all.

     Yields:
         ``iobase.SourceBundle`` objects, each wrapping a new sub-source.
     """
     values = self._serialized_values
     value_count = len(values)
     if value_count < 2:
         yield iobase.SourceBundle(weight=0,
                                   source=self,
                                   start_position=0,
                                   stop_position=value_count)
         return

     cursor = 0 if start_position is None else start_position
     limit = value_count if stop_position is None else stop_position
     bytes_per_value = self._total_size // value_count
     values_per_bundle = max(int(desired_bundle_size // bytes_per_value), 1)
     while cursor < limit:
         bundle_end = min(cursor + values_per_bundle, limit)
         # Avoid having a too small bundle at the end.
         if limit - bundle_end < (values_per_bundle // 4):
             bundle_end = limit
         bundle_len = bundle_end - cursor
         piece = Create._create_source(values[cursor:bundle_end],
                                       self._coder)
         yield iobase.SourceBundle(weight=bundle_len,
                                   source=piece,
                                   start_position=0,
                                   stop_position=bundle_len)
         cursor = bundle_end
예제 #5
0
 def split(self, desired_bundle_size, start_position=None,
           stop_position=None):
   """Divide the values into two bundles of weight 0.5 each.

   All three arguments are ignored; the split is always at the midpoint of
   ``self._values``.

   Yields:
     Two ``iobase.SourceBundle`` objects, each wrapping a ``DummySource``
     over one half of the values.
   """
   # Floor division keeps the midpoint an int. True division returns a
   # float on Python 3, which cannot be used as a slice index.
   middle = len(self._values) // 2
   yield iobase.SourceBundle(0.5, TestConcatSource.DummySource(
       self._values[:middle]), None, None)
   yield iobase.SourceBundle(0.5, TestConcatSource.DummySource(
       self._values[middle:]), None, None)
예제 #6
0
 def split(self, desired_bundle_size, start_position=None, stop_position=None):
     """Yield one weight-1 bundle per interval between sampled row keys.

     The three arguments are not used. Intervals start at the empty key
     ``b''``; if the last sampled key is not empty, a final bundle from
     that key to ``b''`` is added.
     """
     logging.info("ReadFromBigtable split")
     lower_key = b''
     for sample in self._getTable().sample_row_keys():
         upper_key = sample.row_key
         yield iobase.SourceBundle(1, self, lower_key, upper_key)
         lower_key = upper_key
     if lower_key != b'':
         yield iobase.SourceBundle(1, self, lower_key, b'')
예제 #7
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield bundles delimited by ``$bucketAuto`` or ``splitVector`` keys.

        When ``self.bucket_auto`` is set, the buckets from
        ``_get_auto_buckets`` provide both the split keys and the weights
        (their ``count``). Otherwise ``_get_split_keys`` provides the keys
        and every bundle is weighted with the bundle size in megabytes.

        Args:
            desired_bundle_size: Desired bundle size in bytes.
            start_position: Lower bound of the range, or ``None``.
            stop_position: Upper bound of the range, or ``None``.

        Yields:
            ``iobase.SourceBundle`` objects covering the range in order.
        """
        # If the desired chunk size is smaller than 1mb, use the MongoDB
        # default split size of 1mb.
        size_in_mb = max(desired_bundle_size // 1024 // 1024, 1)

        is_initial_split = start_position is None and stop_position is None
        start_position, stop_position = self._replace_none_positions(
            start_position, stop_position)

        if self.bucket_auto:
            # Use $bucketAuto for bundling
            buckets = list(
                self._get_auto_buckets(size_in_mb, start_position,
                                       stop_position, is_initial_split))
            split_keys = [{"_id": bucket["_id"]["max"]} for bucket in buckets]
            weights = [bucket["count"] for bucket in buckets]
        else:
            # Use splitVector for bundling
            split_keys = self._get_split_keys(size_in_mb, start_position,
                                              stop_position)
            weights = itertools.cycle((size_in_mb, ))

        lower = start_position
        for key, bundle_weight in zip(split_keys, weights):
            if lower >= stop_position:
                break
            upper = min(stop_position, key["_id"])
            yield iobase.SourceBundle(weight=bundle_weight,
                                      source=self,
                                      start_position=lower,
                                      stop_position=upper)
            lower = upper

        # add range of last split_key to stop_position
        if lower < stop_position:
            # bucket_auto mode can come here if not split due to single document
            tail_weight = 1 if self.bucket_auto else size_in_mb
            yield iobase.SourceBundle(weight=tail_weight,
                                      source=self,
                                      start_position=lower,
                                      stop_position=stop_position)
예제 #8
0
    def test_position_at_fration(self):
        """Check ``position_at_fraction`` on a ConcatSource of four ranges."""
        bounds = [(0, 4), (4, 16), (16, 24), (24, 32)]
        bundles = [
            iobase.SourceBundle((hi - lo) / 32., RangeSource(lo, hi), None,
                                None) for lo, hi in bounds
        ]
        source = ConcatSource(bundles)

        # Tracker over the whole concatenation.
        tracker = source.get_range_tracker()
        whole_cases = (
            (0, (0, 0)),
            (.01, (0, 1)),
            (.1, (0, 4)),
            (.125, (1, 4)),
            (.2, (1, 7)),
            (.7, (2, 23)),
            (.75, (3, 24)),
            (.8, (3, 26)),
            (1, (4, None)),
        )
        for fraction, expected in whole_cases:
            self.assertEqual(tracker.position_at_fraction(fraction), expected)

        # Tracker restricted to the second and third sub-sources.
        tracker = source.get_range_tracker((1, None), (3, None))
        partial_cases = (
            (0, (1, 4)),
            (.01, (1, 5)),
            (.5, (1, 14)),
            (.599, (1, 16)),
            (.601, (2, 17)),
            (1, (3, None)),
        )
        for fraction, expected in partial_cases:
            self.assertEqual(tracker.position_at_fraction(fraction), expected)
예제 #9
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield one bundle per partition extracted from the source query.

        After ``_validate_query`` runs, ``self.PATTERN`` is matched against
        the query repeatedly. Each time, group 1 is recorded and every
        occurrence of it is removed from the working copy of the query,
        until the pattern no longer matches. Bundles are emitted in the
        reverse of the extraction order.

        Args:
            desired_bundle_size: Used only as the ``weight`` of each bundle.
            start_position: Not used.
            stop_position: Not used.

        Yields:
            ``iobase.SourceBundle`` objects whose start position is the
            partition with commas removed and whose stop position is the
            concatenation of all extracted partitions.
        """
        self._validate_query()

        remaining = self.source.query
        extracted = []
        match = re.match(self.PATTERN, remaining)
        while match:
            piece = match.group(1)
            remaining = remaining.replace(piece, "")
            extracted.append(piece)
            match = re.match(self.PATTERN, remaining)

        extracted.reverse()
        all_partitions = "".join(extracted)
        for piece in extracted:
            yield iobase.SourceBundle(weight=desired_bundle_size,
                                      source=self.source,
                                      start_position=piece.replace(",", ""),
                                      stop_position=all_partitions)
예제 #10
0
  def split(self, desired_bundle_size, start_position=0, stop_position=None):
    """Perform the initial splitting of this source.

    The exact sizes and distribution of the bundles depend on the input
    specification: with ``'zipf'`` splitting the ranges come from
    ``initial_splitting_zipf``; otherwise fixed-size ranges are used.

    Args:
      desired_bundle_size: Desired bundle size.
      start_position: Index of the first record (default ``0``).
      stop_position: Index past the last record; ``None`` means
        ``self._num_records``.

    Yields:
      ``iobase.SourceBundle`` objects weighted by their record count.
    """
    end = self._num_records if stop_position is None else stop_position

    if self._initial_splitting == 'zipf':
      num_bundles = self._initial_splitting_num_bundles or math.ceil(
          float(self.estimate_size()) / desired_bundle_size)
      bundle_ranges = initial_splitting_zipf(
          start_position, end, num_bundles,
          self._initial_splitting_distribution_parameter, self._num_records)
    else:
      if self._initial_splitting_num_bundles:
        step = max(
            1, int(self._num_records / self._initial_splitting_num_bundles))
      else:
        by_size = div_round_up(desired_bundle_size, self.element_size)
        by_sqrt = int(math.floor(math.sqrt(self._num_records)))
        step = max(by_size, by_sqrt)
      bundle_ranges = [(lo, min(lo + step, end))
                       for lo in range(start_position, end, step)]

    for lo, hi in bundle_ranges:
      yield iobase.SourceBundle(hi - lo, self, lo, hi)
예제 #11
0
 def split(self, desired_bundle_size, start_position=None, end_position=None):
   """Yield ``RangeSource`` bundles of at most ``desired_bundle_size`` each.

   The positions are passed through ``self._normalize`` first. Every bundle
   is weighted by its length and carries ``None`` start/stop positions,
   because the sub-source itself holds the sub-range.
   """
   lo, hi = self._normalize(start_position, end_position)
   for piece_start in range(lo, hi, desired_bundle_size):
     piece_end = min(self._end, piece_start + desired_bundle_size)
     weight = piece_end - piece_start
     piece = RangeSource(piece_start, piece_end, self._split_freq)
     yield iobase.SourceBundle(weight, piece, None, None)
예제 #12
0
def create(factory, transform_id, transform_proto, parameter, consumers):
    """Build a read operation for the source pickled in ``parameter.value``.

    The unpickled source is wrapped in a single bundle of weight 1.0 with no
    positions, and the resulting ``ReadOperation`` is handed to
    ``factory.augment_oldstyle_op``. ``transform_id`` is not used.
    """
    source = pickler.loads(parameter.value)
    whole_source = iobase.SourceBundle(1.0, source, None, None)
    output_coders = [WindowedValueCoder(source.default_output_coder())]
    spec = operation_specs.WorkerRead(whole_source, output_coders)
    read_op = operations.ReadOperation(transform_proto.unique_name, spec,
                                       factory.counter_factory,
                                       factory.state_sampler)
    return factory.augment_oldstyle_op(read_op, transform_proto.unique_name,
                                       consumers)
예제 #13
0
 def _run_read_from(self, transform_node, source):
   """Register a new map task that reads ``source``.

   A source that is not a ``NativeSource`` is wrapped in a single bundle of
   weight 1.0. The node's ``None`` output is recorded in ``self.outputs``
   against the new task.

   Returns:
     The index of the new task in ``self.map_tasks``.
   """
   if isinstance(source, NativeSource):
     readable = source
   else:
     readable = iobase.SourceBundle(1.0, source, None, None)
   output = transform_node.outputs[None]
   read_op = operation_specs.WorkerRead(
       readable, output_coders=[self._get_coder(output)])
   task_index = len(self.map_tasks)
   self.outputs[output] = task_index, 0, 0
   self.map_tasks.append([(transform_node.full_label, read_op)])
   return task_index
예제 #14
0
  def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
    """Split this single-file source into offset-range bundles.

    ``None`` offsets fall back to the offsets stored on the instance. A
    splittable source is cut into pieces of
    ``max(desired_bundle_size, self._min_bundle_size)``; an unsplittable
    one is returned as a single bundle that ends at ``OFFSET_INFINITY``.

    Yields:
      ``iobase.SourceBundle`` objects wrapping new ``_SingleFileSource``s.
    """
    begin = self._start_offset if start_offset is None else start_offset
    end = self._stop_offset if stop_offset is None else stop_offset

    if not self._splittable:
      # Returning a single sub-source with end offset set to OFFSET_INFINITY
      # (so that all data of the source gets read) since this source is
      # unsplittable. Choosing size of the file as end offset will be wrong
      # for certain unsplittable source, e.g., compressed sources.
      weight = end - begin
      whole = _SingleFileSource(
          self._file_based_source,
          self._file_name,
          begin,
          range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
          min_bundle_size=self._min_bundle_size,
          splittable=self._splittable)
      yield iobase.SourceBundle(
          weight, whole, begin,
          range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
      return

    step = max(desired_bundle_size, self._min_bundle_size)
    while begin < end:
      piece_end = min(begin + step, end)
      weight = piece_end - begin
      piece = _SingleFileSource(
          # Copying this so that each sub-source gets a fresh instance.
          pickler.loads(pickler.dumps(self._file_based_source)),
          self._file_name,
          begin,
          piece_end,
          min_bundle_size=self._min_bundle_size,
          splittable=self._splittable)
      yield iobase.SourceBundle(weight, piece, begin, piece_end)
      begin = piece_end
def create(factory, transform_id, transform_proto, parameter, consumers):
    """Build a read operation for the source pickled in ``parameter``.

    ``parameter`` is base64-encoded again before it is handed to
    ``pickler.loads``. The unpickled source is wrapped in a single bundle of
    weight 1.0 with no positions. ``transform_id`` is not used.
    """
    # The Dataflow runner harness strips the base64 encoding.
    encoded = base64.b64encode(parameter)
    source = pickler.loads(encoded)
    whole_source = iobase.SourceBundle(1.0, source, None, None)
    output_coders = [WindowedValueCoder(source.default_output_coder())]
    spec = operation_specs.WorkerRead(whole_source, output_coders)
    read_op = operations.ReadOperation(transform_proto.unique_name, spec,
                                       factory.counter_factory,
                                       factory.state_sampler)
    return factory.augment_oldstyle_op(read_op, transform_proto.unique_name,
                                       consumers)
예제 #16
0
    def _create_bundle_source(desired_bundle_size, source, ids):
        """Wrap ``source`` in a bundle whose start position encodes ``ids``.

        Args:
            desired_bundle_size: Used as the bundle weight.
            source: The source the bundle refers to.
            ids: Either a list, which is rendered as comma-separated
                single-quoted items, or a string, which is used verbatim.

        Returns:
            An ``iobase.SourceBundle`` with ``stop_position`` set to ``None``.

        Raises:
            ValueError: If ``ids`` is neither a list nor a string.
        """
        if not isinstance(ids, (list, str)):
            raise ValueError(f"Unexpected ids: {ids}")

        if isinstance(ids, list):
            encoded_ids = ",".join(f"'{item}'" for item in ids)
        else:
            encoded_ids = ids

        return iobase.SourceBundle(
            weight=desired_bundle_size,
            source=source,
            start_position=encoded_ids,
            stop_position=None,
        )
예제 #17
0
  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield weight-1 bundles of ``LineSource.TEST_BUNDLE_SIZE`` bytes each.

    The file size is found by seeking to the end of ``self._file_name``.
    Both positions must be ``None``; ``desired_bundle_size`` is not used.
    """
    assert start_position is None
    assert stop_position is None
    with open(self._file_name, 'rb') as handle:
      handle.seek(0, os.SEEK_END)
      file_size = handle.tell()

    offset = 0
    while offset < file_size:
      next_offset = min(offset + LineSource.TEST_BUNDLE_SIZE, file_size)
      yield iobase.SourceBundle(1, self, offset, next_offset)
      offset = next_offset
예제 #18
0
    def split_range_subranges(self, sample_size_bytes, desired_bundle_size,
                              ranges):
        """Split one key range into bundles of roughly the desired size.

        The ratio ``desired_bundle_size / sample_size_bytes`` is floored to
        two decimals and used as the fraction of the range each bundle
        covers. Intermediate keys come from ``self.fraction_to_position``.

        The range is emitted as a single bundle when either boundary is the
        empty key ``b''``, when the sample is empty, or when the desired
        bundle size is at least the sample size.

        :param sample_size_bytes: The size of the Range.
        :param desired_bundle_size: The desired size to split the Range.
        :param ranges: the Range to split.
        """
        start_position = ranges.start_position()
        end_position = ranges.stop_position()

        unbounded = start_position == b'' or end_position == b''
        if unbounded or sample_size_bytes <= 0:
            # Guarding the empty sample also avoids a ZeroDivisionError.
            split_ = 1
        else:
            split_ = float(desired_bundle_size) / float(sample_size_bytes)
            split_ = math.floor(split_ * 100) / 100

        if split_ >= 1:
            # A ratio above 1 used to fall through to the splitting branch
            # and produce a single empty (end, end) bundle.
            yield iobase.SourceBundle(sample_size_bytes, self, start_position,
                                      end_position)
        else:
            # Ratios below 1% floor to 0, which made the portion size 0 and
            # the loop below never terminate; clamp to the 1% granularity
            # and to at least one byte per bundle.
            split_ = max(split_, 0.01)
            size_portion = max(int(sample_size_bytes * split_), 1)

            start_key = start_position
            sum_portion = size_portion
            while sum_portion < sample_size_bytes:
                fraction_portion = float(sum_portion) / float(
                    sample_size_bytes)
                position = self.fraction_to_position(fraction_portion,
                                                     start_position,
                                                     end_position)
                yield iobase.SourceBundle(long(size_portion), self, start_key,
                                          position)
                start_key = position
                sum_portion += size_portion
            # Whatever is left after the last full portion.
            last_size = sample_size_bytes - (sum_portion - size_portion)
            yield iobase.SourceBundle(long(last_size), self, start_key,
                                      end_position)
예제 #19
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield a single bundle spanning the requested range.

        A ``None`` start becomes ``0`` and a ``None`` stop becomes
        ``OffsetRangeTracker.OFFSET_INFINITY``. The bundle refers to
        ``self.source`` and is weighted with ``desired_bundle_size``.
        """
        begin = 0 if start_position is None else start_position
        if stop_position is None:
            end = OffsetRangeTracker.OFFSET_INFINITY
        else:
            end = stop_position

        yield iobase.SourceBundle(
            weight=desired_bundle_size,
            source=self.source,
            start_position=begin,
            stop_position=end,
        )
예제 #20
0
파일: api_io.py 프로젝트: k4y3ff/dataflow
    def split(self, desired_bundle_size, start_position=0, stop_position=None):
        """Implements :class:`apache_beam.io.iobase.BoundedSource.split`.

        Because the source is unsplittable, only a single bundle is
        produced. It has weight 1, starts at ``start_position`` and always
        ends at ``OFFSET_INFINITY``; the ``stop_position`` argument is
        ignored.
        """
        unbounded_stop = range_trackers.OffsetRangeTracker.OFFSET_INFINITY
        yield iobase.SourceBundle(weight=1,
                                  source=self,
                                  start_position=start_position,
                                  stop_position=unbounded_stop)
예제 #21
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield bundles of ``LineSource.TEST_BUNDLE_SIZE`` up to the size.

        The total comes from ``self.estimate_size()``. Both positions must
        be ``None``; ``desired_bundle_size`` is not used. Each bundle is
        weighted by its length.
        """
        assert start_position is None
        assert stop_position is None
        total_size = self.estimate_size()

        offset = 0
        while offset < total_size:
            next_offset = min(offset + LineSource.TEST_BUNDLE_SIZE,
                              total_size)
            yield iobase.SourceBundle(next_offset - offset, self, offset,
                                      next_offset)
            offset = next_offset
예제 #22
0
파일: utils.py 프로젝트: xsm110/Beam15.0
  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield consecutive bundles of at most ``desired_bundle_size`` each.

    A ``None`` start becomes ``0`` and a ``None`` stop becomes
    ``self._count``. Each bundle is weighted by its length.
    """
    lower = 0 if start_position is None else start_position
    upper = self._count if stop_position is None else stop_position

    while lower < upper:
      piece_end = min(upper, lower + desired_bundle_size)
      yield iobase.SourceBundle(
          weight=(piece_end - lower),
          source=self,
          start_position=lower,
          stop_position=piece_end)
      lower = piece_end
예제 #23
0
    def split(self, desired_bundle_size, start_position=0, stop_position=None):
        """Perform the initial splitting of this source.

        The exact sizes and distribution of the bundles depend on the input
        specification. With ``'zipf'`` splitting, bundle sizes are
        proportional to samples drawn from ``np.random.zipf`` and the last
        bundle absorbs the remainder. Otherwise fixed-size ranges are used.

        Args:
            desired_bundle_size: Desired bundle size.
            start_position: Index of the first record (default ``0``).
            stop_position: Index past the last record; ``None`` means
                ``self._num_records``.

        Yields:
            ``iobase.SourceBundle`` objects weighted by their record count.
        """
        end = self._num_records if stop_position is None else stop_position

        bundle_ranges = []
        if self._initial_splitting == 'zipf':
            num_bundles = self._initial_splitting_num_bundles or math.ceil(
                float(self.estimate_size()) / desired_bundle_size)
            samples = np.random.zipf(
                self._initial_splitting_distribution_parameter, num_bundles)
            sample_total = sum(samples)
            shares = [(float(sample) / sample_total) for sample in samples]

            last_index = num_bundles - 1
            cursor = start_position
            index = 0
            while cursor < end:
                if index == last_index:
                    # The final bundle takes everything that is left.
                    bundle_ranges.append((cursor, end))
                    break
                piece_end = cursor + int(self._num_records * shares[index])
                bundle_ranges.append((cursor, piece_end))
                cursor = piece_end
                index += 1
        else:
            if self._initial_splitting_num_bundles:
                step = max(
                    1,
                    int(self._num_records /
                        self._initial_splitting_num_bundles))
            else:
                by_size = div_round_up(desired_bundle_size, self.element_size)
                by_sqrt = int(math.floor(math.sqrt(self._num_records)))
                step = max(by_size, by_sqrt)
            for lo in range(start_position, end, step):
                bundle_ranges.append((lo, min(lo + step, end)))

        for lo, hi in bundle_ranges:
            yield iobase.SourceBundle(hi - lo, self, lo, hi)
예제 #24
0
    def split(self, desired_bundle_size, start_position=None, stop_position=None):
        """Implements :class:`~apache_beam.io.iobase.BoundedSource.split`.

        This function will currently not be called, because the range
        tracker is unsplittable. A ``None`` start becomes ``0`` and a
        ``None`` stop becomes ``OFFSET_INFINITY``.
        """
        begin = 0 if start_position is None else start_position
        if stop_position is None:
            end = range_trackers.OffsetRangeTracker.OFFSET_INFINITY
        else:
            end = stop_position

        # Because the source is unsplittable (for now), only a single source
        # is returned.
        yield iobase.SourceBundle(weight=1,
                                  source=self,
                                  start_position=begin,
                                  stop_position=end)
예제 #25
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield one bundle per entry from ``_diff_between_dates``.

        After ``_validate_query`` runs, groups 1 and 2 of ``self.PATTERN``
        matched against the query are parsed as ``%Y-%m-%d`` dates and
        passed to ``_diff_between_dates``. Each returned entry supplies a
        bundle's start (index 0) and stop (index 1) positions.

        Args:
            desired_bundle_size: Used only as the ``weight`` of each bundle.
            start_position: Not used.
            stop_position: Not used.
        """
        self._validate_query()

        date_format = "%Y-%m-%d"
        found = re.match(self.PATTERN, self.source.query)
        first_day = datetime.strptime(found.group(1), date_format)
        last_day = datetime.strptime(found.group(2), date_format)

        for period in self._diff_between_dates(first_day, last_day):
            yield iobase.SourceBundle(weight=desired_bundle_size,
                                      source=self.source,
                                      start_position=period[0],
                                      stop_position=period[1])
예제 #26
0
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield consecutive bundles of at most ``desired_bundle_size`` each.

        A ``None`` start becomes ``0`` and a ``None`` stop becomes
        ``self._count``. Each bundle is weighted by its length and carries
        the ``(self.full_path, self.age)`` tuple as its source. A
        ``'bundle split'`` line is printed for every bundle.

        Args:
            desired_bundle_size: Maximum number of positions per bundle;
                values below 1 are treated as 1.
            start_position: First position to cover, or ``None``.
            stop_position: Position past the last one to cover, or ``None``.

        Yields:
            ``iobase.SourceBundle`` objects covering the range in order.
        """
        if start_position is None:
            start_position = 0
        if stop_position is None:
            stop_position = self._count

        # A step below 1 would never advance the loop.
        step = max(desired_bundle_size, 1)

        bundle_start = start_position
        # Bound the loop and each bundle by stop_position. The previous
        # max(self._count, ...) always produced one bundle that ignored the
        # requested stop and could extend past the end of the data.
        while bundle_start < stop_position:
            bundle_stop = min(stop_position, bundle_start + step)
            print('bundle split')
            yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                      source=(self.full_path, self.age),
                                      start_position=bundle_start,
                                      stop_position=bundle_stop)
            bundle_start = bundle_stop
예제 #27
0
  def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield bundles of documents sized by the average document size.

    Document cursor indexes are used as start and stop positions. A
    ``None`` start becomes ``0`` and a ``None`` stop becomes
    ``self.doc_count``.

    Args:
      desired_bundle_size: Desired bundle size, in the same unit as
        ``self.avg_doc_size``.
      start_position: Index of the first document, or ``None``.
      stop_position: Index past the last document, or ``None``.

    Yields:
      ``iobase.SourceBundle`` objects weighted by their document count.
    """
    # use document cursor index as the start and stop positions
    if start_position is None:
      start_position = 0
    if stop_position is None:
      stop_position = self.doc_count

    # get an estimate on how many documents should be included in a split
    # batch
    avg_doc_size = self.avg_doc_size
    if avg_doc_size:
      # At least one document per bundle: a desired size smaller than the
      # average document used to give a count of 0 and an endless loop.
      desired_bundle_count = max(desired_bundle_size // avg_doc_size, 1)
    else:
      # No usable average (it used to raise ZeroDivisionError): emit the
      # whole range as one bundle.
      desired_bundle_count = max(stop_position - start_position, 1)

    bundle_start = start_position
    while bundle_start < stop_position:
      bundle_end = min(stop_position, bundle_start + desired_bundle_count)
      yield iobase.SourceBundle(weight=bundle_end - bundle_start,
                                source=self,
                                start_position=bundle_start,
                                stop_position=bundle_end)
      bundle_start = bundle_end
    def split(self,
              desired_bundle_size,
              start_position=None,
              stop_position=None):
        """Yield consecutive row-range bundles over the dataframe index.

        A ``None`` start becomes ``0`` and a ``None`` stop becomes the
        length of ``self._dataframe.index``. Each bundle is weighted by its
        length.

        Args:
            desired_bundle_size: Maximum number of rows per bundle; values
                below 1 are treated as 1.
            start_position: First row to cover, or ``None``.
            stop_position: Row past the last one to cover, or ``None``.

        Yields:
            ``iobase.SourceBundle`` objects covering the range in order.
        """
        if start_position is None:
            start_position = 0
        if stop_position is None:
            stop_position = len(self._dataframe.index)

        # A step below 1 would never advance the loop.
        step = max(desired_bundle_size, 1)

        bundle_start = start_position
        # Bound the loop and each bundle by stop_position. The previous
        # max(len(index), ...) always produced one bundle that ignored the
        # requested stop and could extend past the last row.
        while bundle_start < stop_position:
            bundle_stop = min(stop_position, bundle_start + step)
            yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                      source=self,
                                      start_position=bundle_start,
                                      stop_position=bundle_stop)
            bundle_start = bundle_stop
예제 #29
0
    def split(
            self,
            desired_bundle_size,  # type: int
            start_position=None,  # type: Optional[Any]
            stop_position=None,  # type: Optional[Any]
    ):  # type: (...) -> Iterator[SourceBundle]
        """Yield consecutive bundles of at most ``desired_bundle_size`` each.

        A ``None`` start becomes ``0`` and a ``None`` stop becomes
        ``self.count()``. Each bundle is weighted by its length.
        """
        lower = 0 if start_position is None else start_position
        upper = self.count() if stop_position is None else stop_position

        while lower < upper:
            piece_end = min(upper, lower + desired_bundle_size)
            yield iobase.SourceBundle(
                weight=(piece_end - lower),
                source=self,
                start_position=lower,
                stop_position=piece_end,
            )
            lower = piece_end
예제 #30
0
    def split(self,
              desired_bundle_size=None,
              start_position=None,
              stop_position=None):
        """Split every sub-source and re-key its bundles by sub-source index.

        Each bundle in ``self._source_bundles`` has its source split over
        its own start/stop positions. The resulting bundles keep their
        weight and source but get ``(index, position)`` tuples as positions.

        Raises:
            ValueError: If a truthy start or stop position is given.
        """
        if start_position or stop_position:
            message = (
                'Multi-level initial splitting is not supported. Expected start and '
                'stop positions to be None. Received %r and %r respectively.' %
                (start_position, stop_position))
            raise ValueError(message)

        for index, parent in enumerate(self._source_bundles):
            # We assume all sub-sources to produce bundles that specify
            # weight using the same unit. For example, all sub-sources may
            # specify the size in bytes as their weight.
            pieces = parent.source.split(desired_bundle_size,
                                         parent.start_position,
                                         parent.stop_position)
            for piece in pieces:
                yield iobase.SourceBundle(piece.weight, piece.source,
                                          (index, piece.start_position),
                                          (index, piece.stop_position))