def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield one bundle per ``self._batch_size`` step plus a trailing bundle.

    The total count is fetched from ``self.source.client.counts_estimator``
    the first time it is needed (while ``self._counts`` is still 0) and is
    cached on ``self._counts``.

    Args:
        desired_bundle_size: Used verbatim as the weight of every bundle.
        start_position: First offset; 0 when ``None``.
        stop_position: End of the range; ``self._counts`` when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects whose source is ``self.source``.
    """
    if self._counts == 0:
        self._counts = self.source.client.counts_estimator(
            self.source.query)

    first_offset = 0 if start_position is None else start_position
    range_end = self._counts if stop_position is None else stop_position

    tail_start = 0
    for offset in range(first_offset, range_end, self._batch_size):
        # NOTE(review): the stop position handed out here is the batch size
        # itself, not ``offset + batch size`` -- presumably the reader treats
        # it as a row limit; confirm against the consumer.
        yield iobase.SourceBundle(
            weight=desired_bundle_size,
            source=self.source,
            start_position=offset,
            stop_position=self._batch_size,
        )
        tail_start = offset + self._batch_size

    # A final bundle is always emitted, starting one past the last batch end.
    yield iobase.SourceBundle(
        weight=desired_bundle_size,
        source=self.source,
        start_position=tail_start + 1,
        stop_position=range_end,
    )
def split_range_subranges(self, sample_size_bytes, desired_bundle_size,
                          ranges):
    """Split ``ranges`` into bundles of roughly ``desired_bundle_size`` bytes.

    The ratio ``desired_bundle_size / sample_size_bytes`` is rounded down to
    a whole percent and used as the portion of the range each bundle covers.
    Intermediate split keys come from ``self.fraction_to_position``.

    Args:
        sample_size_bytes: Size of the range being split.
        desired_bundle_size: Target size of each produced bundle.
        ranges: Object exposing ``start_position()`` and ``stop_position()``.

    Yields:
        ``iobase.SourceBundle`` objects that together cover the range from
        its start position to its stop position.
    """
    start_position = ranges.start_position()
    end_position = ranges.stop_position()

    # A range that already fits in one bundle, an empty sample, or an
    # open-ended range (b'' boundary) is returned whole. Previously a
    # desired size larger than the sample fell through to the splitting
    # branch and produced only an empty (end, end) bundle.
    if (sample_size_bytes <= 0 or
            desired_bundle_size >= sample_size_bytes or
            start_position == b'' or end_position == b''):
        yield iobase.SourceBundle(sample_size_bytes, self, start_position,
                                  end_position)
        return

    split_ = float(desired_bundle_size) / float(sample_size_bytes)
    split_ = math.floor(split_ * 100) / 100
    # Ratios below 1% floor to 0, which would never advance the loop below;
    # fall back to the smallest portion the rounding can represent.
    split_ = max(split_, 0.01)
    size_portion = max(1, int(sample_size_bytes * split_))

    bundle_start = start_position
    sum_portion = size_portion
    while sum_portion < sample_size_bytes:
        fraction_portion = float(sum_portion) / float(sample_size_bytes)
        position = self.fraction_to_position(fraction_portion,
                                             start_position, end_position)
        yield iobase.SourceBundle(size_portion, self, bundle_start, position)
        bundle_start = position
        sum_portion += size_portion

    # Whatever is left after the last full portion goes into the final
    # bundle, which starts where the previous bundle ended.
    last_size = sample_size_bytes - (sum_portion - size_portion)
    yield iobase.SourceBundle(last_size, self, bundle_start, end_position)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split the ``[start_position, stop_position)`` range at split keys.

    Args:
        desired_bundle_size: Desired bundle size in bytes; converted to whole
            megabytes before being passed to ``self._get_split_keys``.
        start_position: Range start, or ``None`` to let
            ``self._replace_none_positions`` choose it.
        stop_position: Range end, or ``None`` to let
            ``self._replace_none_positions`` choose it.

    Yields:
        ``iobase.SourceBundle`` objects covering consecutive sub-ranges.
    """
    start_position, stop_position = self._replace_none_positions(
        start_position, stop_position)

    # Integer division turns any request below 1 MB into 0, which is not a
    # usable chunk size (and would give every bundle weight 0); use 1 MB as
    # the minimum instead.
    desired_bundle_size_in_mb = max(desired_bundle_size // 1024 // 1024, 1)

    split_keys = self._get_split_keys(desired_bundle_size_in_mb,
                                      start_position, stop_position)

    bundle_start = start_position
    for split_key_id in split_keys:
        if bundle_start >= stop_position:
            break
        bundle_end = min(stop_position, split_key_id)
        yield iobase.SourceBundle(weight=desired_bundle_size_in_mb,
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=bundle_end)
        bundle_start = bundle_end

    # add range of last split_key to stop_position
    if bundle_start < stop_position:
        yield iobase.SourceBundle(weight=desired_bundle_size_in_mb,
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=stop_position)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split the serialized values into sub-sources of similar byte size.

    With fewer than two values a single zero-weight bundle covering
    everything is produced. Otherwise the average encoded size is used to
    decide how many values go into each sub-source.

    Args:
        desired_bundle_size: Desired bundle size, in the same unit as
            ``self._total_size``.
        start_position: Index of the first value; 0 when ``None``.
        stop_position: Index one past the last value; the number of
            serialized values when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects, each wrapping a new source built by
        ``Create._create_source`` from a slice of the values.
    """
    values = self._serialized_values
    if len(values) < 2:
        yield iobase.SourceBundle(weight=0,
                                  source=self,
                                  start_position=0,
                                  stop_position=len(values))
        return

    if start_position is None:
        start_position = 0
    if stop_position is None:
        stop_position = len(values)

    # The floored average is 0 when the values encode to (almost) nothing;
    # treat it as 1 so the division below cannot raise ZeroDivisionError.
    avg_size_per_value = max(self._total_size // len(values), 1)
    num_values_per_split = max(
        int(desired_bundle_size // avg_size_per_value), 1)

    start = start_position
    while start < stop_position:
        end = min(start + num_values_per_split, stop_position)
        remaining = stop_position - end
        # Avoid having a too small bundle at the end.
        if remaining < (num_values_per_split // 4):
            end = stop_position

        sub_source = Create._create_source(values[start:end], self._coder)
        yield iobase.SourceBundle(weight=(end - start),
                                  source=sub_source,
                                  start_position=0,
                                  stop_position=(end - start))
        start = end
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Divide ``self._values`` into two bundles of weight 0.5 each.

    The size and position arguments are ignored; both bundles carry ``None``
    as start and stop position.
    """
    # Floor division keeps the midpoint an int: a float is not a valid slice
    # index on Python 3.
    middle = len(self._values) // 2
    yield iobase.SourceBundle(
        0.5, TestConcatSource.DummySource(self._values[:middle]), None, None)
    yield iobase.SourceBundle(
        0.5, TestConcatSource.DummySource(self._values[middle:]), None, None)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield one weight-1 bundle per gap between consecutive sample row keys.

    Sample keys come from ``self._getTable().sample_row_keys()``. The first
    bundle starts at ``b''``; if the last sample key is not ``b''``, a final
    bundle from that key to ``b''`` is added. The arguments are not used.
    """
    logging.info("ReadFromBigtable split")
    samples = self._getTable().sample_row_keys()

    previous_key = b''
    for sample in samples:
        yield iobase.SourceBundle(1, self, previous_key, sample.row_key)
        previous_key = sample.row_key

    # Cover the tail after the last sampled key, if any key was seen.
    if previous_key != b'':
        yield iobase.SourceBundle(1, self, previous_key, b'')
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split the range into bundles using bucket or split-key boundaries.

    Args:
        desired_bundle_size: Desired bundle size in bytes; converted to whole
            megabytes with a minimum of 1.
        start_position: Range start, or ``None``.
        stop_position: Range end, or ``None``.

    Yields:
        ``iobase.SourceBundle`` objects covering consecutive sub-ranges. In
        ``bucket_auto`` mode the weight is each bucket's ``count``; otherwise
        it is the bundle size in megabytes.
    """
    # for desired bundle size, if desired chunk size smaller than 1mb, use
    # MongoDB default split size of 1mb.
    size_in_mb = max(desired_bundle_size // 1024 // 1024, 1)

    # Must be evaluated before the None positions are replaced below.
    is_initial_split = start_position is None and stop_position is None
    start_position, stop_position = self._replace_none_positions(
        start_position, stop_position)

    if self.bucket_auto:
        # Use $bucketAuto for bundling
        buckets = list(
            self._get_auto_buckets(
                size_in_mb,
                start_position,
                stop_position,
                is_initial_split,
            ))
        split_keys = [{"_id": bucket["_id"]["max"]} for bucket in buckets]
        weights = [bucket["count"] for bucket in buckets]
    else:
        # Use splitVector for bundling
        split_keys = self._get_split_keys(size_in_mb, start_position,
                                          stop_position)
        weights = itertools.cycle((size_in_mb, ))

    cursor = start_position
    for key, weight in zip(split_keys, weights):
        if cursor >= stop_position:
            break
        upper = min(stop_position, key["_id"])
        yield iobase.SourceBundle(
            weight=weight,
            source=self,
            start_position=cursor,
            stop_position=upper,
        )
        cursor = upper

    # add range of last split_key to stop_position
    if cursor < stop_position:
        # bucket_auto mode can come here if not split due to single document
        tail_weight = 1 if self.bucket_auto else size_in_mb
        yield iobase.SourceBundle(
            weight=tail_weight,
            source=self,
            start_position=cursor,
            stop_position=stop_position,
        )
def test_position_at_fration(self):
    """Check ``position_at_fraction`` on a ``ConcatSource`` of four ranges.

    Each sub-source is weighted by its share of the 32 covered positions.
    Expected positions are ``(source index, position)`` pairs.
    """
    ranges = [(0, 4), (4, 16), (16, 24), (24, 32)]
    source = ConcatSource([
        iobase.SourceBundle((stop - start) / 32.,
                            RangeSource(start, stop),
                            None,
                            None) for start, stop in ranges
    ])

    # Tracker over the whole concatenation.
    range_tracker = source.get_range_tracker()
    full_range_cases = [
        (0, (0, 0)),
        (.01, (0, 1)),
        (.1, (0, 4)),
        (.125, (1, 4)),
        (.2, (1, 7)),
        (.7, (2, 23)),
        (.75, (3, 24)),
        (.8, (3, 26)),
        (1, (4, None)),
    ]
    for fraction, expected in full_range_cases:
        self.assertEqual(range_tracker.position_at_fraction(fraction),
                         expected)

    # Tracker restricted to sub-sources 1 and 2.
    range_tracker = source.get_range_tracker((1, None), (3, None))
    sub_range_cases = [
        (0, (1, 4)),
        (.01, (1, 5)),
        (.5, (1, 14)),
        (.599, (1, 16)),
        (.601, (2, 17)),
        (1, (3, None)),
    ]
    for fraction, expected in sub_range_cases:
        self.assertEqual(range_tracker.position_at_fraction(fraction),
                         expected)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield one bundle per partition token found in the source query.

    ``self.PATTERN`` is matched repeatedly against the query; each time,
    group 1 is recorded and removed from the query, until the pattern no
    longer matches. Bundles are produced in the reverse of discovery order.

    Args:
        desired_bundle_size: Used verbatim as the weight of every bundle.
        start_position: Unused.
        stop_position: Unused.

    Yields:
        ``iobase.SourceBundle`` objects whose start position is the partition
        token with commas removed and whose stop position is the
        concatenation of all (reversed) partition tokens.
    """
    self._validate_query()

    remaining_query = self.source.query
    partitions = []
    match = re.match(self.PATTERN, remaining_query)
    while match:
        token = match.group(1)
        remaining_query = remaining_query.replace(token, "")
        partitions.append(token)
        match = re.match(self.PATTERN, remaining_query)

    partitions.reverse()
    joined_partitions = "".join(partitions)

    for token in partitions:
        yield iobase.SourceBundle(
            weight=desired_bundle_size,
            source=self.source,
            start_position=token.replace(",", ""),
            stop_position=joined_partitions,
        )
def split(self, desired_bundle_size, start_position=0, stop_position=None):
    """Perform the initial splitting of the source into record ranges.

    Exact sizes and distribution of the initial splits depend on the input
    specification of the source: ``'zipf'`` splitting delegates to
    ``initial_splitting_zipf``; anything else produces equal-sized ranges.

    Args:
        desired_bundle_size: Desired bundle size used to derive the bundle
            count or the per-bundle element count.
        start_position: First record index.
        stop_position: End record index; ``self._num_records`` when ``None``.

    Yields:
        ``iobase.SourceBundle(stop - start, self, start, stop)`` per range.
    """
    if stop_position is None:
        stop_position = self._num_records

    if self._initial_splitting == 'zipf':
        desired_num_bundles = (
            self._initial_splitting_num_bundles or
            math.ceil(float(self.estimate_size()) / desired_bundle_size))
        bundle_ranges = initial_splitting_zipf(
            start_position,
            stop_position,
            desired_num_bundles,
            self._initial_splitting_distribution_parameter,
            self._num_records)
    else:
        if self._initial_splitting_num_bundles:
            elements_per_bundle = max(
                1,
                int(self._num_records / self._initial_splitting_num_bundles))
        else:
            elements_per_bundle = max(
                div_round_up(desired_bundle_size, self.element_size),
                int(math.floor(math.sqrt(self._num_records))))
        bundle_ranges = [
            (begin, min(begin + elements_per_bundle, stop_position))
            for begin in range(start_position, stop_position,
                               elements_per_bundle)
        ]

    for begin, end in bundle_ranges:
        yield iobase.SourceBundle(end - begin, self, begin, end)
def split(self, desired_bundle_size, start_position=None, end_position=None):
    """Split the requested range into ``RangeSource`` bundles.

    Args:
        desired_bundle_size: Number of positions per bundle.
        start_position: Range start, resolved by ``self._normalize``.
        end_position: Range end, resolved by ``self._normalize``.

    Yields:
        ``iobase.SourceBundle`` objects, each wrapping a new ``RangeSource``
        over one sub-range and weighted by that sub-range's length.
    """
    start, end = self._normalize(start_position, end_position)
    # Bound both the loop and each bundle by the same limit. Previously the
    # loop ran to ``end`` while bundles were clamped to ``self._end``, so a
    # smaller requested end was overrun and a larger one produced bundles of
    # zero or negative weight.
    limit = min(end, self._end)
    for sub_start in range(start, limit, desired_bundle_size):
        sub_end = min(limit, sub_start + desired_bundle_size)
        yield iobase.SourceBundle(
            sub_end - sub_start,
            RangeSource(sub_start, sub_end, self._split_freq),
            None,
            None)
def create(factory, transform_id, transform_proto, parameter, consumers):
    """Build a read operation for the source pickled in ``parameter.value``.

    The unpickled source is wrapped in a weight-1.0 ``SourceBundle`` with no
    positions, turned into a ``WorkerRead`` spec and then a
    ``ReadOperation``, which is passed to ``factory.augment_oldstyle_op``.
    ``transform_id`` is not used.
    """
    # NOTE(review): unpickling is only safe if ``parameter`` comes from a
    # trusted producer -- confirm.
    source = pickler.loads(parameter.value)

    bundle = iobase.SourceBundle(1.0, source, None, None)
    output_coders = [WindowedValueCoder(source.default_output_coder())]
    spec = operation_specs.WorkerRead(bundle, output_coders)

    read_operation = operations.ReadOperation(transform_proto.unique_name,
                                              spec,
                                              factory.counter_factory,
                                              factory.state_sampler)
    return factory.augment_oldstyle_op(read_operation,
                                       transform_proto.unique_name,
                                       consumers)
def _run_read_from(self, transform_node, source):
    """Used when this operation is the result of reading source.

    Registers a new map task holding a single read operation and returns the
    index of that task in ``self.map_tasks``.
    """
    # Anything that is not a NativeSource is wrapped in a whole-source bundle.
    if not isinstance(source, NativeSource):
        source = iobase.SourceBundle(1.0, source, None, None)

    output = transform_node.outputs[None]
    read_op = operation_specs.WorkerRead(
        source, output_coders=[self._get_coder(output)])

    task_index = len(self.map_tasks)
    self.outputs[output] = (task_index, 0, 0)
    self.map_tasks.append([(transform_node.full_label, read_op)])
    return task_index
def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
    """Split this single-file source into offset-range bundles.

    Args:
        desired_bundle_size: Desired bundle size; raised to
            ``self._min_bundle_size`` when smaller.
        start_offset: Range start; ``self._start_offset`` when ``None``.
        stop_offset: Range end; ``self._stop_offset`` when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects wrapping new ``_SingleFileSource``
        instances: one per chunk when splittable, otherwise exactly one.
    """
    if start_offset is None:
        start_offset = self._start_offset
    if stop_offset is None:
        stop_offset = self._stop_offset

    if not self._splittable:
        # Returning a single sub-source with end offset set to OFFSET_INFINITY
        # (so that all data of the source gets read) since this source is
        # unsplittable. Choosing size of the file as end offset will be wrong
        # for certain unsplittable source, e.g., compressed sources.
        infinity = range_trackers.OffsetRangeTracker.OFFSET_INFINITY
        yield iobase.SourceBundle(
            stop_offset - start_offset,
            _SingleFileSource(
                self._file_based_source,
                self._file_name,
                start_offset,
                infinity,
                min_bundle_size=self._min_bundle_size,
                splittable=self._splittable),
            start_offset,
            infinity)
        return

    chunk_size = max(desired_bundle_size, self._min_bundle_size)
    chunk_start = start_offset
    while chunk_start < stop_offset:
        chunk_stop = min(chunk_start + chunk_size, stop_offset)
        # Copying this so that each sub-source gets a fresh instance.
        fresh_source = pickler.loads(pickler.dumps(self._file_based_source))
        yield iobase.SourceBundle(
            chunk_stop - chunk_start,
            _SingleFileSource(
                fresh_source,
                self._file_name,
                chunk_start,
                chunk_stop,
                min_bundle_size=self._min_bundle_size,
                splittable=self._splittable),
            chunk_start,
            chunk_stop)
        chunk_start = chunk_stop
def create(factory, transform_id, transform_proto, parameter, consumers):
    """Build a read operation for the source carried in ``parameter``.

    ``parameter`` is base64-encoded again before being handed to
    ``pickler.loads``. The resulting source is wrapped in a weight-1.0
    ``SourceBundle`` with no positions, turned into a ``WorkerRead`` spec and
    a ``ReadOperation``, which is passed to ``factory.augment_oldstyle_op``.
    ``transform_id`` is not used.
    """
    # The Dataflow runner harness strips the base64 encoding.
    encoded = base64.b64encode(parameter)
    source = pickler.loads(encoded)

    bundle = iobase.SourceBundle(1.0, source, None, None)
    output_coders = [WindowedValueCoder(source.default_output_coder())]
    spec = operation_specs.WorkerRead(bundle, output_coders)

    read_operation = operations.ReadOperation(transform_proto.unique_name,
                                              spec,
                                              factory.counter_factory,
                                              factory.state_sampler)
    return factory.augment_oldstyle_op(read_operation,
                                       transform_proto.unique_name,
                                       consumers)
def _create_bundle_source(desired_bundle_size, source, ids): if isinstance(ids, list): ids_str = ",".join([f"'{id}'" for id in ids]) elif isinstance(ids, str): ids_str = ids else: raise ValueError(f"Unexpected ids: {ids}") return iobase.SourceBundle(weight=desired_bundle_size, source=source, start_position=ids_str, stop_position=None)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split the file into byte ranges of ``LineSource.TEST_BUNDLE_SIZE``.

    Both positions must be ``None``. The file size is found by seeking to
    the end of ``self._file_name``; ``desired_bundle_size`` is not used.

    Yields:
        Weight-1 ``iobase.SourceBundle`` objects over consecutive byte
        ranges.
    """
    assert start_position is None
    assert stop_position is None

    with open(self._file_name, 'rb') as handle:
        handle.seek(0, os.SEEK_END)
        file_size = handle.tell()

    range_start = 0
    while range_start < file_size:
        range_stop = min(range_start + LineSource.TEST_BUNDLE_SIZE, file_size)
        yield iobase.SourceBundle(1, self, range_start, range_stop)
        range_start = range_stop
def split_range_subranges(self, sample_size_bytes, desired_bundle_size,
                          ranges):
    '''Split a range using ``desired_bundle_size`` as the size limit.

    The size of the range is compared with ``desired_bundle_size``; if the
    range needs splitting, intermediate keys are obtained from the
    ``fraction_to_position`` method.

    :param sample_size_bytes: The size of the Range.
    :param desired_bundle_size: The desired size to split the Range.
    :param ranges: the Range to split.
    :return: a generator of ``iobase.SourceBundle`` objects that together
      cover the range from its start position to its stop position.
    '''
    start_position = ranges.start_position()
    end_position = ranges.stop_position()

    # A range that already fits in one bundle, an empty sample, or an
    # open-ended range (b'' boundary) is returned whole. Previously a
    # desired size larger than the sample fell through to the splitting
    # branch and produced only an empty (end, end) bundle.
    if (sample_size_bytes <= 0 or
            desired_bundle_size >= sample_size_bytes or
            start_position == b'' or end_position == b''):
        yield iobase.SourceBundle(sample_size_bytes, self, start_position,
                                  end_position)
        return

    split_ = float(desired_bundle_size) / float(sample_size_bytes)
    split_ = math.floor(split_ * 100) / 100
    # Ratios below 1% floor to 0, which would never advance the loop below;
    # fall back to the smallest portion the rounding can represent.
    split_ = max(split_, 0.01)
    # ``long`` does not exist on Python 3; ``int`` is already unbounded.
    size_portion = max(1, int(sample_size_bytes * split_))

    bundle_start = start_position
    sum_portion = size_portion
    while sum_portion < sample_size_bytes:
        fraction_portion = float(sum_portion) / float(sample_size_bytes)
        position = self.fraction_to_position(fraction_portion,
                                             start_position, end_position)
        yield iobase.SourceBundle(size_portion, self, bundle_start, position)
        bundle_start = position
        sum_portion += size_portion

    # Whatever is left after the last full portion goes into the final
    # bundle, which starts where the previous bundle ended.
    last_size = int(sample_size_bytes - (sum_portion - size_portion))
    yield iobase.SourceBundle(last_size, self, bundle_start, end_position)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield a single bundle covering the whole requested range.

    Args:
        desired_bundle_size: Used verbatim as the bundle weight.
        start_position: Range start; 0 when ``None``.
        stop_position: Range end; ``OffsetRangeTracker.OFFSET_INFINITY``
            when ``None``.

    Yields:
        Exactly one ``iobase.SourceBundle`` whose source is ``self.source``.
    """
    first = 0 if start_position is None else start_position
    last = (OffsetRangeTracker.OFFSET_INFINITY
            if stop_position is None else stop_position)
    yield iobase.SourceBundle(
        weight=desired_bundle_size,
        source=self.source,
        start_position=first,
        stop_position=last,
    )
def split(self, desired_bundle_size, start_position=0, stop_position=None):
    """Implements class: `apache_beam.io.iobase.BoundedSource.split`

    Because the source is unsplittable, only a single source is returned.
    The given ``stop_position`` is always replaced by
    ``OffsetRangeTracker.OFFSET_INFINITY``; ``desired_bundle_size`` is not
    used.
    """
    yield iobase.SourceBundle(
        weight=1,
        source=self,
        start_position=start_position,
        stop_position=range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
    )
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split the source into ranges of ``LineSource.TEST_BUNDLE_SIZE``.

    Both positions must be ``None``. The total size comes from
    ``self.estimate_size()``; ``desired_bundle_size`` is not used.

    Yields:
        ``iobase.SourceBundle`` objects over consecutive ranges, each
        weighted by the length of its range.
    """
    assert start_position is None
    assert stop_position is None

    total_size = self.estimate_size()
    range_start = 0
    while range_start < total_size:
        range_stop = min(range_start + LineSource.TEST_BUNDLE_SIZE, total_size)
        yield iobase.SourceBundle(range_stop - range_start,
                                  self,
                                  range_start,
                                  range_stop)
        range_start = range_stop
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split ``[start_position, stop_position)`` into fixed-size bundles.

    Args:
        desired_bundle_size: Number of positions per bundle.
        start_position: Range start; 0 when ``None``.
        stop_position: Range end; ``self._count`` when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects over consecutive sub-ranges, each
        weighted by the length of its sub-range.
    """
    lower = 0 if start_position is None else start_position
    upper = self._count if stop_position is None else stop_position

    while lower < upper:
        chunk_end = min(upper, lower + desired_bundle_size)
        yield iobase.SourceBundle(
            weight=(chunk_end - lower),
            source=self,
            start_position=lower,
            stop_position=chunk_end,
        )
        lower = chunk_end
def split(self, desired_bundle_size, start_position=0, stop_position=None):
    """Perform the initial splitting of the source into record ranges.

    Exact sizes and distribution of the initial splits depend on the input
    specification of the source: ``'zipf'`` splitting sizes bundles from
    samples drawn with ``np.random.zipf``; anything else produces
    equal-sized ranges.

    Args:
        desired_bundle_size: Desired bundle size used to derive the bundle
            count or the per-bundle element count.
        start_position: First record index.
        stop_position: End record index; ``self._num_records`` when ``None``.

    Yields:
        ``iobase.SourceBundle(stop - start, self, start, stop)`` per range.
    """
    if stop_position is None:
        stop_position = self._num_records

    if self._initial_splitting == 'zipf':
        desired_num_bundles = (
            self._initial_splitting_num_bundles or
            math.ceil(float(self.estimate_size()) / desired_bundle_size))
        samples = np.random.zipf(
            self._initial_splitting_distribution_parameter,
            desired_num_bundles)
        sample_total = sum(samples)
        shares = [float(sample) / sample_total for sample in samples]

        bundle_ranges = []
        begin = start_position
        bundle_index = 0
        while begin < stop_position:
            # The last planned bundle absorbs everything that remains.
            if bundle_index == desired_num_bundles - 1:
                bundle_ranges.append((begin, stop_position))
                break
            end = begin + int(self._num_records * shares[bundle_index])
            bundle_ranges.append((begin, end))
            begin = end
            bundle_index += 1
    else:
        if self._initial_splitting_num_bundles:
            elements_per_bundle = max(
                1,
                int(self._num_records / self._initial_splitting_num_bundles))
        else:
            elements_per_bundle = max(
                div_round_up(desired_bundle_size, self.element_size),
                int(math.floor(math.sqrt(self._num_records))))
        bundle_ranges = [
            (begin, min(begin + elements_per_bundle, stop_position))
            for begin in range(start_position, stop_position,
                               elements_per_bundle)
        ]

    for begin, end in bundle_ranges:
        yield iobase.SourceBundle(end - begin, self, begin, end)
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Implements :class:`~apache_beam.io.iobase.BoundedSource.split`

    This function will currently not be called, because the range tracker
    is unsplittable
    """
    first = 0 if start_position is None else start_position
    last = (range_trackers.OffsetRangeTracker.OFFSET_INFINITY
            if stop_position is None else stop_position)

    # Because the source is unsplittable (for now), only a single source is
    # returned.
    yield iobase.SourceBundle(
        weight=1,
        source=self,
        start_position=first,
        stop_position=last,
    )
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Yield one bundle per entry returned by ``self._diff_between_dates``.

    Groups 1 and 2 of ``self.PATTERN`` matched against the source query are
    parsed as ``%Y-%m-%d`` dates and passed to ``_diff_between_dates``.

    Args:
        desired_bundle_size: Used verbatim as the weight of every bundle.
        start_position: Unused.
        stop_position: Unused.

    Yields:
        ``iobase.SourceBundle`` objects whose start and stop positions are
        items 0 and 1 of each returned entry.
    """
    self._validate_query()

    date_format = "%Y-%m-%d"
    match = re.match(self.PATTERN, self.source.query)
    start_date, end_date = (
        datetime.strptime(match.group(group), date_format)
        for group in (1, 2))

    for month in self._diff_between_dates(start_date, end_date):
        yield iobase.SourceBundle(
            weight=desired_bundle_size,
            source=self.source,
            start_position=month[0],
            stop_position=month[1],
        )
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split ``[start_position, stop_position)`` into fixed-size bundles.

    Args:
        desired_bundle_size: Number of positions per bundle.
        start_position: Range start; 0 when ``None``.
        stop_position: Range end; ``self._count`` when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects over consecutive sub-ranges, each
        weighted by the length of its sub-range. The bundle source is the
        ``(self.full_path, self.age)`` tuple.
    """
    if start_position is None:
        start_position = 0
    if stop_position is None:
        stop_position = self._count

    bundle_start = start_position
    # Iterate up to the requested stop position (not always self._count) and
    # cap each bundle with min(): max() made every bundle run to at least
    # self._count, so no real splitting ever happened.
    while bundle_start < stop_position:
        bundle_stop = min(stop_position, bundle_start + desired_bundle_size)
        print('bundle split')
        yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                  source=(self.full_path, self.age),
                                  start_position=bundle_start,
                                  stop_position=bundle_stop)
        bundle_start = bundle_stop
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split a document index range into bundles of similar byte size.

    The document cursor index is used for the start and stop positions.

    Args:
        desired_bundle_size: Desired bundle size, in the same unit as
            ``self.avg_doc_size``.
        start_position: First document index; 0 when ``None``.
        stop_position: End document index; ``self.doc_count`` when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects over consecutive index ranges, each
        weighted by the number of documents it covers.
    """
    if start_position is None:
        start_position = 0
    if stop_position is None:
        stop_position = self.doc_count

    # Estimate how many documents fit into one bundle. An unknown or zero
    # average size would divide by zero, and a bundle size smaller than one
    # document would give 0 documents per bundle and loop forever, so always
    # advance by at least one document.
    avg_doc_size = self.avg_doc_size
    if avg_doc_size:
        desired_bundle_count = desired_bundle_size // avg_doc_size
    else:
        desired_bundle_count = desired_bundle_size
    desired_bundle_count = max(desired_bundle_count, 1)

    bundle_start = start_position
    while bundle_start < stop_position:
        bundle_end = min(stop_position, bundle_start + desired_bundle_count)
        yield iobase.SourceBundle(weight=bundle_end - bundle_start,
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=bundle_end)
        bundle_start = bundle_end
def split(self, desired_bundle_size, start_position=None, stop_position=None):
    """Split a row index range of ``self._dataframe`` into bundles.

    Args:
        desired_bundle_size: Number of rows per bundle.
        start_position: First row index; 0 when ``None``.
        stop_position: End row index; the length of the dataframe index
            when ``None``.

    Yields:
        ``iobase.SourceBundle`` objects over consecutive row ranges, each
        weighted by the number of rows it covers.
    """
    if start_position is None:
        start_position = 0
    if stop_position is None:
        stop_position = len(self._dataframe.index)

    bundle_start = start_position
    # Iterate up to the requested stop position (not always the full frame)
    # and cap each bundle with min(): max() made every bundle run to at
    # least the end of the frame, so no real splitting ever happened.
    while bundle_start < stop_position:
        bundle_stop = min(stop_position, bundle_start + desired_bundle_size)
        yield iobase.SourceBundle(weight=(bundle_stop - bundle_start),
                                  source=self,
                                  start_position=bundle_start,
                                  stop_position=bundle_stop)
        bundle_start = bundle_stop
def split(
    self,
    desired_bundle_size,  # type: int
    start_position=None,  # type: Optional[Any]
    stop_position=None,  # type: Optional[Any]
):
    # type: (...) -> Iterator[SourceBundle]
    """Split ``[start_position, stop_position)`` into fixed-size bundles.

    ``start_position`` defaults to 0 and ``stop_position`` to
    ``self.count()``. Each yielded bundle is weighted by the length of the
    sub-range it covers.
    """
    lower = 0 if start_position is None else start_position
    upper = self.count() if stop_position is None else stop_position

    while lower < upper:
        chunk_end = min(upper, lower + desired_bundle_size)
        yield iobase.SourceBundle(
            weight=(chunk_end - lower),
            source=self,
            start_position=lower,
            stop_position=chunk_end,
        )
        lower = chunk_end
def split(self,
          desired_bundle_size=None,
          start_position=None,
          stop_position=None):
    """Split every wrapped source bundle and re-key the resulting positions.

    Args:
        desired_bundle_size: Passed through to each sub-source's ``split``.
        start_position: Must be falsy.
        stop_position: Must be falsy.

    Yields:
        ``iobase.SourceBundle`` objects whose positions are
        ``(sub-source index, original position)`` pairs.

    Raises:
        ValueError: If a truthy start or stop position is given.
    """
    if start_position or stop_position:
        message = (
            'Multi-level initial splitting is not supported. Expected start and '
            'stop positions to be None. Received %r and %r respectively.')
        raise ValueError(message % (start_position, stop_position))

    for source_index, wrapped in enumerate(self._source_bundles):
        # We assume all sub-sources to produce bundles that specify weight
        # using the same unit. For example, all sub-sources may specify the
        # size in bytes as their weight.
        sub_bundles = wrapped.source.split(desired_bundle_size,
                                           wrapped.start_position,
                                           wrapped.stop_position)
        for sub_bundle in sub_bundles:
            yield iobase.SourceBundle(
                sub_bundle.weight,
                sub_bundle.source,
                (source_index, sub_bundle.start_position),
                (source_index, sub_bundle.stop_position))