def test_split_respects_min_num_splits(self):
    """Check splitting when the minimum split size exceeds the desired size.

    The range [10, 100) is split with a desired size of 5 offsets and a
    minimum size of 25 offsets. The assertions expect exactly three ranges:
    [10, 35), [35, 60) and [60, 100).
    """
    offset_range = OffsetRange(10, 100)
    splits = list(
        offset_range.split(
            desired_num_offsets_per_split=5,
            min_num_offsets_per_split=25))

    self.assertEqual(3, len(splits))

    expected_bounds = [(10, 35), (35, 60), (60, 100)]
    for start, stop in expected_bounds:
        self.assertIn(OffsetRange(start, stop), splits)
def process(self, element, *args, **kwargs):
    """Yield ``(metadata, OffsetRange)`` pairs for the files matching element.

    ``element`` is handed to ``FileSystems.match`` as a single-item list and
    only the first match result is consumed. For each metadata entry:

    * if the file is considered splittable, one pair is yielded per range
      produced by splitting ``[0, size_in_bytes)`` with the configured desired
      and minimum bundle sizes;
    * otherwise a single pair is yielded whose range runs from 0 to
      ``OFFSET_INFINITY``.

    Args:
      element: value passed to ``FileSystems.match`` (presumably a file
        pattern -- confirm against the caller).
      *args: unused.
      **kwargs: unused.

    Yields:
      Tuples of ``(metadata, OffsetRange)``.
    """
    first_match = FileSystems.match([element])[0]
    for file_metadata in first_match.metadata_list:
        # The compression-type check is only evaluated when this instance was
        # configured as splittable (short-circuit ``and``).
        can_split = (
            self._splittable and
            _determine_splittability_from_compression_type(
                file_metadata.path, self._compression_type))

        if not can_split:
            unbounded = OffsetRange(
                0, range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
            yield (file_metadata, unbounded)
            continue

        whole_file = OffsetRange(0, file_metadata.size_in_bytes)
        pieces = whole_file.split(
            self._desired_bundle_size, self._min_bundle_size)
        for piece in pieces:
            yield (file_metadata, piece)
def test_split_respects_desired_num_splits(self):
    """Check splitting by desired size when the range divides unevenly.

    The range [10, 100) is split with a desired size of 25 offsets. The
    assertions expect four ranges: three of 25 offsets followed by the
    remainder [85, 100).
    """
    offset_range = OffsetRange(10, 100)
    splits = list(offset_range.split(desired_num_offsets_per_split=25))

    self.assertEqual(4, len(splits))

    expected_bounds = [(10, 35), (35, 60), (60, 85), (85, 100)]
    for start, stop in expected_bounds:
        self.assertIn(OffsetRange(start, stop), splits)
def test_split_no_small_split_at_end(self):
    """Check that a short tail is folded into the last split.

    The range [10, 90) is split with a desired size of 25 offsets. The
    assertions expect three ranges, the last being [60, 90) rather than a
    separate 5-offset range at the end.
    """
    offset_range = OffsetRange(10, 90)
    splits = list(offset_range.split(desired_num_offsets_per_split=25))

    self.assertEqual(3, len(splits))

    expected_bounds = [(10, 35), (35, 60), (60, 90)]
    for start, stop in expected_bounds:
        self.assertIn(OffsetRange(start, stop), splits)
def split(self, desired_bundle_size, start_offset=None, stop_offset=None):
    """Yield ``iobase.SourceBundle`` objects covering this single-file source.

    Args:
      desired_bundle_size: desired size of each split, forwarded to
        ``OffsetRange.split`` when the source is splittable.
      start_offset: first offset to cover; defaults to ``self._start_offset``
        when ``None``.
      stop_offset: end offset to cover; defaults to ``self._stop_offset``
        when ``None``.

    Yields:
      One ``SourceBundle`` per sub-range when the source is splittable,
      otherwise a single ``SourceBundle`` ending at ``OFFSET_INFINITY``.
    """
    if start_offset is None:
        start_offset = self._start_offset
    if stop_offset is None:
        stop_offset = self._stop_offset

    if not self._splittable:
        # Returning a single sub-source with end offset set to OFFSET_INFINITY
        # (so that all data of the source gets read) since this source is
        # unsplittable. Choosing size of the file as end offset will be wrong
        # for certain unsplittable source, e.g., compressed sources.
        bundle_weight = stop_offset - start_offset
        whole_source = _SingleFileSource(
            self._file_based_source,
            self._file_name,
            start_offset,
            range_trackers.OffsetRangeTracker.OFFSET_INFINITY,
            min_bundle_size=self._min_bundle_size,
            splittable=self._splittable)
        yield iobase.SourceBundle(
            bundle_weight,
            whole_source,
            start_offset,
            range_trackers.OffsetRangeTracker.OFFSET_INFINITY)
        return

    full_range = OffsetRange(start_offset, stop_offset)
    pieces = full_range.split(desired_bundle_size, self._min_bundle_size)
    for piece in pieces:
        bundle_weight = piece.stop - piece.start
        # Copying this so that each sub-source gets a fresh instance.
        source_copy = pickler.loads(pickler.dumps(self._file_based_source))
        sub_source = _SingleFileSource(
            source_copy,
            self._file_name,
            piece.start,
            piece.stop,
            min_bundle_size=self._min_bundle_size,
            splittable=self._splittable)
        yield iobase.SourceBundle(
            bundle_weight, sub_source, piece.start, piece.stop)
def test_create(self):
    """Check which bounds ``OffsetRange`` accepts at construction time.

    (0, 10) and (10, 100) must construct without error; (10, 9), where the
    stop precedes the start, must raise ``ValueError``.
    """
    valid_bounds = [(0, 10), (10, 100)]
    for start, stop in valid_bounds:
        OffsetRange(start, stop)

    self.assertRaises(ValueError, OffsetRange, 10, 9)