def __iter__(self):
    return iter(
        streaming_shuffle(
            iter(self.iterator),
            bufsize=self.buffer_size,
            rng=self.rng,
        )
    )
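# For context: ``streaming_shuffle`` performs buffer-based (reservoir-style)
# shuffling of a stream without loading it fully into memory. Below is a
# minimal sketch of that idea, assuming only the call signature seen above;
# it is illustrative, not Lhotse's exact implementation.
from typing import Generator, Iterator, TypeVar
import random

T = TypeVar("T")

def streaming_shuffle_sketch(
    data: Iterator[T], bufsize: int, rng: random.Random
) -> Generator[T, None, None]:
    buf = []
    for item in data:
        if len(buf) < bufsize:
            # Fill the buffer before yielding anything.
            buf.append(item)
            continue
        # Swap the incoming item with a random buffered element
        # and yield the evicted one.
        idx = rng.randrange(bufsize)
        buf[idx], item = item, buf[idx]
        yield item
    # Drain the remaining buffer in random order.
    rng.shuffle(buf)
    yield from buf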
def __iter__(self) -> "DynamicCutSampler": if self._just_restored_state: return self self.rng = random.Random(self.seed + self.epoch) # Initiate iteration self.cuts_iter = [iter(cs) for cs in self.cuts] # Optionally shuffle if self.shuffle: self.cuts_iter = [ # Important -- every shuffler has a copy of RNG seeded in the same way, # so that they are reproducible. streaming_shuffle( cs, rng=random.Random(self.seed + self.epoch), bufsize=self.shuffle_buffer_size, ) for cs in self.cuts_iter ] # Apply filter predicate self.cuts_iter = Filter( iterator=zip(*self.cuts_iter), predicate=lambda tpl: all(self._filter_fn(c) for c in tpl), diagnostics=self.diagnostics, ) # Convert Iterable[Cut] -> Iterable[CutSet] self.cuts_iter = DurationBatcher( self.cuts_iter, max_duration=self.max_duration, max_cuts=self.max_cuts, drop_last=self.drop_last, diagnostics=self.diagnostics, strict=self.strict, ) self.cuts_iter = iter(self.cuts_iter) return self
def test_streaming_shuffle(datasize, bufsize):
    data = list(range(int(datasize)))
    shuffled = list(
        streaming_shuffle(iter(data), bufsize=int(bufsize), rng=random.Random(42))
    )
    assert len(data) == len(shuffled)
    assert len(shuffled) == len(set(shuffled))
    assert data != shuffled
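# The three assertions verify that the output is a true permutation of the
# input (same length, no duplicates) and that the order actually changed.
# The ``datasize``/``bufsize`` arguments suggest pytest parametrization; a
# hypothetical decoration (the values are illustrative, not from the source):
import pytest

@pytest.mark.parametrize("datasize", [100, 1000])
@pytest.mark.parametrize("bufsize", [10, 100])
def test_streaming_shuffle_parametrized(datasize, bufsize):
    ...  # same body as above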
def __init__(
    self,
    *cuts: CutSet,
    max_duration: float,
    max_cuts: Optional[int] = None,
    num_buckets: int = 10,
    shuffle: bool = False,
    drop_last: bool = False,
    consistent_ids: bool = True,
    num_cuts_for_bins_estimate: int = 10000,
    buffer_size: int = 10000,
    shuffle_buffer_size: int = 20000,
    strict: bool = True,
    world_size: Optional[int] = None,
    rank: Optional[int] = None,
    seed: int = 0,
) -> None:
    """
    :param cuts: one or more CutSets (when more than one, will yield tuples of CutSets
        as mini-batches).
    :param max_duration: The maximum total recording duration from ``cuts``.
        Note: with multiple CutSets, the ``max_duration`` constraint applies only to
        the first CutSet.
    :param max_cuts: The maximum total number of ``cuts`` per batch.
        When only ``max_duration`` is specified, this sampler yields dynamic batch sizes.
    :param num_buckets: how many buckets to create.
    :param shuffle: When ``True``, the cuts will be shuffled dynamically with
        a reservoir-sampling-based algorithm. Convenient when the mini-batch loop
        is inside an outer epoch-level loop, e.g.:
        ``for epoch in range(10): for batch in dataset: ...``,
        as every epoch will see a different order of cuts.
    :param drop_last: When ``True``, we will drop all incomplete batches.
        A batch is considered incomplete if it depleted a bucket before hitting
        a constraint such as ``max_duration``, ``max_cuts``, etc.
    :param consistent_ids: Only affects processing of multiple CutSets.
        When ``True``, at each sampling step we check that cuts from all CutSets
        have the same ID (i.e., the first cut from every CutSet should have the
        same ID, and likewise for the second, third, etc.).
    :param num_cuts_for_bins_estimate: We will draw this many cuts to estimate
        the duration bins for creating similar-duration buckets. A larger number
        means a better estimate of the data distribution, possibly at a higher init cost.
    :param buffer_size: How many cuts (or cut pairs, triplets) we hold at any time
        across all of the buckets. Increasing ``max_duration`` (batch size) or
        ``num_buckets`` might require increasing this number.
        It will result in larger memory usage.
    :param shuffle_buffer_size: How many cuts (or cut pairs, triplets) are held
        in memory in a buffer used for streaming shuffling. A larger number means
        better randomness at the cost of higher memory usage.
    :param strict: When ``True``, for the purposes of determining dynamic batch size,
        we take the longest cut sampled so far and multiply its
        duration/num_frames/num_samples by the number of cuts currently in the
        mini-batch to check if it exceeds ``max_duration`` (etc.).
        This can help make GPU memory usage more predictable when there is
        a large variance in cut durations.
    :param world_size: Total number of distributed nodes. We will try to infer it by default.
    :param rank: Index of the distributed node. We will try to infer it by default.
    :param seed: Random seed used to consistently shuffle the dataset across
        different processes.
    """
    super().__init__(world_size=world_size, rank=rank, seed=seed)
    if not all(cs.is_lazy for cs in cuts if isinstance(cs, CutSet)):
        warnings.warn(
            "You are using DynamicBucketingSampler with an eagerly read CutSet. "
            "You won't see any memory/speed benefits with that setup. "
            "Either use 'CutSet.from_jsonl_lazy' to read the CutSet lazily, "
            "or use a BucketingSampler instead."
        )
    self.cuts = cuts
    self.max_duration = max_duration
    self.max_cuts = max_cuts
    self.shuffle = shuffle
    self.drop_last = drop_last
    self.consistent_ids = consistent_ids
    self.num_cuts_for_bins_estimate = num_cuts_for_bins_estimate
    self.buffer_size = buffer_size
    self.shuffle_buffer_size = shuffle_buffer_size
    self.strict = strict
    self.rng = None

    if self.shuffle:
        cuts_for_bins_estimate = streaming_shuffle(
            iter(self.cuts[0]),
            rng=random.Random(self.seed),
            bufsize=self.shuffle_buffer_size,
        )
    else:
        cuts_for_bins_estimate = self.cuts[0]
    self.duration_bins = estimate_duration_buckets(
        islice(cuts_for_bins_estimate, num_cuts_for_bins_estimate),
        num_buckets=num_buckets,
    )
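# ``estimate_duration_buckets`` is not shown in this section. A minimal sketch,
# assuming it returns ``num_buckets - 1`` duration bin edges that split the
# sampled cuts into buckets of roughly equal total duration mass (the
# ``_sketch`` name and the ``.duration`` attribute access are assumptions,
# not Lhotse's exact code):
from typing import Iterable, List
import numpy as np

def estimate_duration_buckets_sketch(cuts: Iterable, num_buckets: int) -> List[float]:
    # Sort durations, then pick edges at equal fractions of the
    # cumulative duration mass (assumes a non-empty input).
    durations = np.sort(np.array([c.duration for c in cuts]))
    cum = np.cumsum(durations)
    total = cum[-1]
    edges = []
    for i in range(1, num_buckets):
        idx = int(np.searchsorted(cum, total * i / num_buckets))
        edges.append(float(durations[min(idx, len(durations) - 1)]))
    return edges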