def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
    """
    Collate the pre-computed features of the given cuts into a single batch.

    The features are read from disk/other storage; all of the work is delegated
    to ``collate_features``.

    :param cuts: the cuts whose features should be batched together.
    :return: a pair of tensors -- the collated features with shape
        ``(B, T, F) => (batch_size, num_frames, num_features)``, and the
        ``num_frames`` of each cut before padding.
    """
    collated = collate_features(cuts)
    return collated
def __getitem__(self, cuts: CutSet) -> Dict[str, Any]:
    """
    Build a batch dictionary for the given cuts.

    ``self._validate(cuts)`` is invoked before any collation takes place.

    :param cuts: the cuts that make up the batch.
    :return: a dict with the keys ``"cuts"`` (the input, passed through unchanged),
        ``"features"`` and ``"features_lens"`` (the two values returned by
        ``collate_features``).
    """
    self._validate(cuts)
    batch: Dict[str, Any] = {"cuts": cuts}
    batch["features"], batch["features_lens"] = collate_features(cuts)
    return batch
def test_collate_feature_padding():
    """Check that collated features are padded to the frame count of the longest cut."""
    cuts = CutSet.from_json("test/fixtures/ljspeech/cuts.json")
    frame_counts = [cut.num_frames for cut in cuts]

    # Sanity check on the fixture: padding is only exercised when the cuts
    # do not all share the same number of frames.
    assert len(set(frame_counts)) > 1
    longest = max(frame_counts)

    features, features_lens = collate_features(cuts)

    # The time axis of the batch and the largest reported length must both
    # match the longest cut.
    assert features.shape[1] == longest
    assert max(features_lens).item() == longest
def test_specaugment_batch(num_feature_masks, num_frame_masks):
    """
    Check that ``SpecAugment`` applied with ``p=1.0`` modifies a collated feature batch.

    :param num_feature_masks: forwarded to ``SpecAugment``
        (presumably supplied by a parametrize decorator outside this view).
    :param num_frame_masks: forwarded to ``SpecAugment``
        (presumably supplied by a parametrize decorator outside this view).
    """
    cuts = CutSet.from_json('test/fixtures/ljspeech/cuts.json')
    # Only the feature tensor is needed here; the lengths are discarded.
    feats, _ = collate_features(cuts)

    spec_augment = SpecAugment(
        p=1.0,
        time_warp_factor=10,
        features_mask_size=5,
        frames_mask_size=20,
        num_feature_masks=num_feature_masks,
        num_frame_masks=num_frame_masks,
    )
    augmented = spec_augment(feats)

    # At least one element has to differ from the un-augmented input.
    differs = feats != augmented
    assert differs.any()
def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return a new batch, with the batch size automatically determined using the
    constraints of max_frames and max_cuts.

    :return: a dict with the keys ``'features'`` (the result of ``collate_features``)
        and ``'supervisions'`` (the ``default_collate``-d per-supervision entries
        ``sequence_idx``, ``text``, ``start_frame`` and ``num_frames``). When
        ``self.return_cuts`` is set, ``batch['supervisions']['cut']`` additionally
        holds the owning cut of each supervision.
    """
    # Gather the cuts that satisfy the max_cuts / max_frames criteria. The result
    # is a CutSet, so it can be further modified below (padding, mixing, etc.).
    batch_cuts: CutSet = self._collect_batch()

    # Longest cut first, so that it determines the time dimension of the batch.
    batch_cuts = batch_cuts.sort_by_duration(ascending=False)

    # Padding step (possibly doubling as augmentation).
    if self.aug_cuts is None:
        # Plain padding with low energy values up to the longest cut's duration.
        batch_cuts = batch_cuts.pad()
    else:
        # Mix in the signal from the augmentation CutSet; the mixed-in cuts
        # serve as padding at the same time.
        batch_cuts = batch_cuts.mix(
            self.aug_cuts,
            duration=batch_cuts[0].duration,
            snr=self.aug_snr,
            mix_prob=self.aug_prob,
        )

    # Batched feature matrices, shape (B, T, F).
    # NOTE(review): the return value is stored as-is under 'features'. Other call
    # sites in this file unpack ``collate_features`` into a
    # ``(features, features_lens)`` pair -- confirm against its signature whether
    # unpacking is needed here.
    features = collate_features(batch_cuts)

    # One entry per supervision; ``sequence_idx`` points back at the position of
    # the owning cut within the batch.
    supervision_items = []
    for sequence_idx, cut in enumerate(batch_cuts):
        for supervision in cut.supervisions:
            start_frame, num_frames = supervision_to_frames(
                supervision,
                cut.frame_shift,
                cut.sampling_rate,
                max_frames=cut.num_frames,
            )
            supervision_items.append({
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': start_frame,
                'num_frames': num_frames,
            })

    batch = {
        'features': features,
        'supervisions': default_collate(supervision_items),
    }

    if self.return_cuts:
        # The cut is repeated once per supervision to stay aligned with the
        # other per-supervision fields.
        batch['supervisions']['cut'] = [
            cut for cut in batch_cuts for _ in cut.supervisions
        ]

    return batch
def __getitem__(
        self,
        cut_ids: List[str]
) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return the batch made of the cuts identified by ``cut_ids``.

    :param cut_ids: IDs of the cuts (looked up in ``self.cuts``) that form the batch.
    :return: a dict with the keys ``'features'`` (the result of ``collate_features``)
        and ``'supervisions'`` (the ``default_collate``-d per-supervision entries
        ``sequence_idx``, ``text``, ``start_frame`` and ``num_frames``). When
        ``self.return_cuts`` is set, ``batch['supervisions']['cut']`` additionally
        holds the owning cut of each supervision.
    """
    # Select the requested cuts. The result is a CutSet, so it can be further
    # modified below (padding, mixing, etc.).
    selected = self.cuts.subset(cut_ids=cut_ids)

    # Longest cut first, so that it determines the time dimension of the batch.
    selected = selected.sort_by_duration(ascending=False)

    # Optional transforms, applied in the order they are stored.
    for transform in self.cut_transforms:
        selected = transform(selected)

    # Batched feature matrices, shape (B, T, F).
    # Collation performs auto-padding, if necessary.
    # NOTE(review): the return value is stored as-is under 'features'. Other call
    # sites in this file unpack ``collate_features`` into a
    # ``(features, features_lens)`` pair -- confirm against its signature whether
    # unpacking is needed here.
    features = collate_features(selected)

    # One entry per supervision; ``sequence_idx`` points back at the position of
    # the owning cut within the batch.
    supervision_items = []
    for sequence_idx, cut in enumerate(selected):
        for supervision in cut.supervisions:
            start_frame, num_frames = supervision_to_frames(
                supervision,
                cut.frame_shift,
                cut.sampling_rate,
                max_frames=cut.num_frames,
            )
            supervision_items.append({
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': start_frame,
                'num_frames': num_frames,
            })

    batch = {
        'features': features,
        'supervisions': default_collate(supervision_items),
    }

    if self.return_cuts:
        # The cut is repeated once per supervision to stay aligned with the
        # other per-supervision fields.
        batch['supervisions']['cut'] = [
            cut for cut in selected for _ in cut.supervisions
        ]

    return batch
def __getitem__(self, cut_ids: Iterable[str]) -> Dict[str, torch.Tensor]:
    """
    Return the features and speaker-activity targets for the cuts with the given IDs.

    :param cut_ids: IDs of the cuts (looked up in ``self.cuts``) that form the batch.
    :return: a dict with the keys ``'features'`` and ``'features_lens'`` (the two
        values returned by ``collate_features``) and ``'speaker_activity'`` (the
        per-cut speaker feature masks collated by ``collate_matrices``).
    """
    cuts = self.cuts.subset(cut_ids=cut_ids)
    features, features_lens = collate_features(cuts)

    # Lazily produces one speaker mask per cut.
    speaker_masks = (
        cut.speakers_feature_mask(
            min_speaker_dim=self.min_speaker_dim,
            speaker_to_idx_map=self.speakers,
        )
        for cut in cuts
    )
    # In case padding is needed, we will add a special symbol
    # that tells the cross entropy loss to ignore the frame during scoring.
    ignored_label = CrossEntropyLoss().ignore_index
    speaker_activity = collate_matrices(speaker_masks, padding_value=ignored_label)

    return {
        'features': features,
        'features_lens': features_lens,
        'speaker_activity': speaker_activity,
    }
def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
    """
    Return the features and speaker-activity targets for the given cuts.

    :param cuts: the cuts that make up the batch.
    :return: a dict with the keys ``"features"`` and ``"features_lens"`` (the two
        values returned by ``collate_features``) and ``"speaker_activity"`` (the
        per-cut speaker feature masks collated by ``collate_matrices``).
    """
    batch: Dict[str, torch.Tensor] = {}
    batch["features"], batch["features_lens"] = collate_features(cuts)

    # Lazily produces one speaker mask per cut.
    per_cut_masks = (
        cut.speakers_feature_mask(
            min_speaker_dim=self.min_speaker_dim,
            speaker_to_idx_map=self.speakers,
        )
        for cut in cuts
    )
    batch["speaker_activity"] = collate_matrices(
        per_cut_masks,
        # In case padding is needed, we will add a special symbol
        # that tells the cross entropy loss to ignore the frame during scoring.
        padding_value=CrossEntropyLoss().ignore_index,
    )
    return batch
def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
    """
    Return a new batch, with the batch size automatically determined using the
    constraints of max_frames and max_cuts.

    :return: a dict with the keys ``'features'`` (the result of ``collate_features``)
        and ``'supervisions'`` (the ``default_collate``-d per-supervision entries
        ``sequence_idx``, ``text``, ``start_frame`` and ``num_frames``). When
        ``self.return_cuts`` is set, ``batch['supervisions']['cut']`` additionally
        holds the owning cut of each supervision.
    """
    # Gather the cuts that satisfy the max_cuts / max_frames criteria. The result
    # is a CutSet, so it can be further modified below (padding, mixing, etc.).
    batch_cuts: CutSet = self._collect_batch()

    # For now the cuts are simply padded with low energy values to match the
    # longest cut's duration in the batch. Something more interesting could be
    # done here later on -- padding/mixing with noises, etc.
    batch_cuts = batch_cuts.sort_by_duration().pad()

    # Batched feature matrices, shape (B, T, F).
    # NOTE(review): the return value is stored as-is under 'features'. Other call
    # sites in this file unpack ``collate_features`` into a
    # ``(features, features_lens)`` pair -- confirm against its signature whether
    # unpacking is needed here.
    features = collate_features(batch_cuts)

    # One entry per supervision; ``sequence_idx`` points back at the position of
    # the owning cut within the batch.
    supervision_items = []
    for sequence_idx, cut in enumerate(batch_cuts):
        for supervision in cut.supervisions:
            # Note: Rounding "floor" can sometimes result in one extra frame being
            # included in the left context; but it guarantees that we will never go
            # out-of-bounds when summing start_frame + num_frames.
            start_frame = compute_num_frames(
                supervision.start,
                frame_shift=cut.frame_shift,
                rounding=ROUND_FLOOR,
            )
            num_frames = compute_num_frames(
                supervision.duration,
                frame_shift=cut.frame_shift,
            )
            supervision_items.append({
                'sequence_idx': sequence_idx,
                'text': supervision.text,
                'start_frame': start_frame,
                'num_frames': num_frames,
            })

    batch = {
        'features': features,
        'supervisions': default_collate(supervision_items),
    }

    if self.return_cuts:
        # The cut is repeated once per supervision to stay aligned with the
        # other per-supervision fields.
        batch['supervisions']['cut'] = [
            cut for cut in batch_cuts for _ in cut.supervisions
        ]

    return batch