def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
    """
    Returns a dict that specifies the start and end bounds for each supervision,
    as a 1-D int tensor, in terms of frames:

    .. code-block::

        {
            "sequence_idx": tensor(shape=(S,)),
            "start_frame": tensor(shape=(S,)),
            "num_frames": tensor(shape=(S,))
        }

    Where ``S`` is the total number of supervisions encountered in the :class:`CutSet`.
    Note that ``S`` might be different than the number of cuts (``B``).
    ``sequence_idx`` means the index of the corresponding feature matrix (or cut) in a batch.
    """
    start_frames, nums_frames = zip(*(
        supervision_to_frames(sup, self.extractor.frame_shift, cut.sampling_rate)
        for cut in cuts
        for sup in cut.supervisions
    ))
    # Index of the cut (i.e. feature matrix in the batch) that each supervision belongs to.
    sequence_idx = [i for i, c in enumerate(cuts) for s in c.supervisions]
    return {
        "sequence_idx": torch.tensor(sequence_idx, dtype=torch.int32),
        "start_frame": torch.tensor(start_frames, dtype=torch.int32),
        "num_frames": torch.tensor(nums_frames, dtype=torch.int32),
    }
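
# A minimal, self-contained sketch of how the dict returned by
# supervision_intervals() can be consumed: each entry selects a frame span
# within its cut's feature matrix. The tensor sizes and interval values below
# are made up for illustration only and are not part of the code above.
import torch

features = torch.randn(2, 500, 80)  # (B, T, F): 2 cuts, 500 frames, 80 feature bins

# Stand-in for the output of supervision_intervals() for 3 supervisions
# spread over the 2 cuts.
intervals = {
    "sequence_idx": torch.tensor([0, 0, 1], dtype=torch.int32),
    "start_frame": torch.tensor([0, 250, 10], dtype=torch.int32),
    "num_frames": torch.tensor([240, 200, 480], dtype=torch.int32),
}

segments = [
    features[seq_idx, start:start + num]
    for seq_idx, start, num in zip(intervals["sequence_idx"].tolist(),
                                   intervals["start_frame"].tolist(),
                                   intervals["num_frames"].tolist())
]
for seg in segments:
    print(seg.shape)  # (num_frames, F) for each supervision
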
def supervision_intervals(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
    """
    Returns a dict that specifies the start and end bounds for each supervision,
    as a 1-D int tensor, in terms of frames.
    """
    start_frames, nums_frames = zip(*(
        supervision_to_frames(sup, self.extractor.frame_shift, cut.sampling_rate)
        for cut in cuts
        for sup in cut.supervisions
    ))
    return {
        'start_frame': torch.tensor(start_frames, dtype=torch.int32),
        'num_frames': torch.tensor(nums_frames, dtype=torch.int32)
    }
def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return a new batch, with the batch size automatically determined using the constraints
    of max_frames and max_cuts.
    """
    # Collect the cuts that will form a batch, satisfying the criteria of max_cuts and max_frames.
    # The returned object is a CutSet that we can keep on modifying (e.g. padding, mixing, etc.)
    cuts: CutSet = self._collect_batch()
    # Sort the cuts by duration so that the first one determines the batch time dimensions.
    cuts = cuts.sort_by_duration(ascending=False)
    # Perform the padding (and possibly augmentation at the same time).
    if self.aug_cuts is not None:
        # Mix in the signal from the augmentation CutSet; use them as padding at the same time.
        cuts = cuts.mix(
            self.aug_cuts,
            duration=cuts[0].duration,
            snr=self.aug_snr,
            mix_prob=self.aug_prob
        )
    else:
        # We'll just pad it with low energy values to match the longest Cut's duration in the batch.
        cuts = cuts.pad()
    # Get a tensor with batched feature matrices, shape (B, T, F)
    features = collate_features(cuts)
    batch = {
        'features': features,
        'supervisions': default_collate([
            {
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': start_frame,
                'num_frames': num_frames
            }
            for sequence_idx, cut in enumerate(cuts)
            for supervision, (start_frame, num_frames) in zip(
                cut.supervisions,
                (
                    supervision_to_frames(
                        s, cut.frame_shift, cut.sampling_rate, max_frames=cut.num_frames
                    )
                    for s in cut.supervisions
                )
            )
        ])
    }
    if self.return_cuts:
        batch['supervisions']['cut'] = [cut for cut in cuts for sup in cut.supervisions]
    return batch
def __getitem__(self, cut_ids: List[str]) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return a new batch, with the batch size automatically determined using the constraints
    of max_frames and max_cuts.
    """
    # Collect the cuts that will form a batch, satisfying the criteria of max_cuts and max_frames.
    # The returned object is a CutSet that we can keep on modifying (e.g. padding, mixing, etc.)
    cuts = self.cuts.subset(cut_ids=cut_ids)
    # Sort the cuts by duration so that the first one determines the batch time dimensions.
    cuts = cuts.sort_by_duration(ascending=False)
    # Optional transforms.
    for tnfm in self.cut_transforms:
        cuts = tnfm(cuts)
    # Get a tensor with batched feature matrices, shape (B, T, F)
    # Collation performs auto-padding, if necessary.
    features = collate_features(cuts)
    batch = {
        'features': features,
        'supervisions': default_collate([
            {
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': start_frame,
                'num_frames': num_frames
            }
            for sequence_idx, cut in enumerate(cuts)
            for supervision, (start_frame, num_frames) in zip(
                cut.supervisions,
                (
                    supervision_to_frames(
                        s, cut.frame_shift, cut.sampling_rate, max_frames=cut.num_frames
                    )
                    for s in cut.supervisions
                )
            )
        ])
    }
    if self.return_cuts:
        batch['supervisions']['cut'] = [cut for cut in cuts for sup in cut.supervisions]
    return batch
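
# A minimal usage sketch, assuming the __getitem__ above belongs to Lhotse's
# K2SpeechRecognitionDataset in a version where a cut sampler yields lists of
# cut ids to the dataset. The import paths, class names, sampler parameters,
# and the manifest path below are assumptions and may differ between Lhotse
# versions.
from torch.utils.data import DataLoader

from lhotse import CutSet
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler

cuts = CutSet.from_file('data/cuts_train.jsonl.gz')  # hypothetical path
dataset = K2SpeechRecognitionDataset(cuts, return_cuts=False)
sampler = SingleCutSampler(cuts, max_frames=50000)
# batch_size=None disables PyTorch's automatic batching: the sampler emits whole
# batches of cut ids, and __getitem__ collates them into tensors itself.
loader = DataLoader(dataset, sampler=sampler, batch_size=None)

batch = next(iter(loader))
print(batch['features'].shape)            # (B, T, F)
print(batch['supervisions']['text'][:3])  # transcripts, one per supervision
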