def __call__(
    self, cuts: CutSet
) -> Union[
    Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
]:
    """
    Reads the audio samples from recordings on disk/other storage
    and computes their features.
    The returned shape is
    ``(B, T, F) => (batch_size, num_frames, num_features)``.

    :return: a tensor with collated features, and a tensor of
        ``num_frames`` of each cut before padding.
    """
    audios, cuts = read_audio_from_cuts(
        cuts,
        executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        suppress_errors=self.fault_tolerant,
    )

    for tfnm in self.wave_transforms:
        for idx in range(len(audios)):
            audios[idx] = tfnm(audios[idx])

    if self.use_batch_extract:
        # Batch extraction is possibly faster depending on the implementation
        # of the feature extractor.
        assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
        features_single = self.extractor.extract_batch(
            audios, sampling_rate=cuts[0].sampling_rate
        )
    else:
        # Sequential extraction allows the sampling rates to be different.
        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audios[idx].numpy()
            try:
                features = self.extractor.extract(samples, cuts[idx].sampling_rate)
            except Exception:
                logging.error(
                    f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                )
                raise
            features_single.append(torch.from_numpy(features))

    features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

    feature_lens = torch.tensor(
        [
            compute_num_frames(
                cut.duration, self.extractor.frame_shift, cut.sampling_rate
            )
            for cut in cuts
        ],
        dtype=torch.int32,
    )

    if self.fault_tolerant:
        return features_batch, feature_lens, cuts
    return features_batch, feature_lens
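# A minimal usage sketch for the batch extractor above. It assumes this
# ``__call__`` lives on an OnTheFlyFeatures-style input strategy from
# lhotse; the manifest path below is hypothetical.
from lhotse import CutSet, Fbank
from lhotse.dataset import OnTheFlyFeatures

cuts = CutSet.from_file("data/cuts.jsonl.gz")  # hypothetical path

strategy = OnTheFlyFeatures(Fbank())
feats, feat_lens = strategy(cuts)  # (B, T, F) features, (B,) frame counts
assert feats.size(0) == feat_lens.size(0) == len(cuts)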
def __getitem__(self, cuts: CutSet) -> torch.Tensor:
    self._validate(cuts)
    features = collate_matrices(
        cut.compute_features(
            extractor=self.feature_extractor,
            augment_fn=self.augment_fn,
        )
        for cut in cuts
    )
    return features
def __getitem__(self, cuts: CutSet) -> torch.Tensor:
    self._validate(cuts)

    def generate_cut(cuts: CutSet):
        for cut in cuts:
            with suppress_audio_loading_errors():
                yield cut.compute_features(
                    extractor=self.feature_extractor,
                    augment_fn=self.augment_fn,
                )

    features = collate_matrices(generate_cut(cuts))
    return features
def __getitem__(self, cuts: CutSet) -> torch.Tensor:
    self._validate(cuts)

    def generate_cut(cuts: CutSet):
        for cut in cuts:
            with suppress_and_warn(AudioLoadingError, DurationMismatchError):
                yield cut.compute_features(
                    extractor=self.feature_extractor,
                    augment_fn=self.augment_fn,
                )

    features = collate_matrices(generate_cut(cuts))
    return features
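# For illustration, a minimal sketch of what a ``suppress_and_warn``-style
# helper could look like (the real lhotse utility may differ): it swallows
# the listed exception types and emits a warning instead of failing the
# whole batch, so the generator above simply yields fewer feature matrices
# when a cut cannot be loaded.
import warnings
from contextlib import contextmanager

@contextmanager
def suppress_and_warn_sketch(*exceptions):
    try:
        yield
    except exceptions as e:
        # The offending cut is skipped; leave a trace for debugging.
        warnings.warn(f"Skipping cut due to {type(e).__name__}: {e}")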
def __getitem__(self, cut_ids: Iterable[str]) -> Dict[str, torch.Tensor]:
    cuts = self.cuts.subset(cut_ids=cut_ids)
    features, features_lens = collate_features(cuts)
    return {
        'features': features,
        'features_lens': features_lens,
        'speaker_activity': collate_matrices(
            (
                cut.speakers_feature_mask(
                    min_speaker_dim=self.min_speaker_dim,
                    speaker_to_idx_map=self.speakers,
                )
                for cut in cuts
            ),
            # In case padding is needed, we will add a special symbol
            # that tells the cross entropy loss to ignore the frame during scoring.
            padding_value=CrossEntropyLoss().ignore_index,
        ),
    }
def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
    features, features_lens = collate_features(cuts)
    return {
        "features": features,
        "features_lens": features_lens,
        "speaker_activity": collate_matrices(
            (
                cut.speakers_feature_mask(
                    min_speaker_dim=self.min_speaker_dim,
                    speaker_to_idx_map=self.speakers,
                )
                for cut in cuts
            ),
            # In case padding is needed, we will add a special symbol
            # that tells the cross entropy loss to ignore the frame during scoring.
            padding_value=CrossEntropyLoss().ignore_index,
        ),
    }
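# A toy illustration of why ``padding_value=CrossEntropyLoss().ignore_index``
# is safe: targets equal to the ignore index (-100 by default) contribute
# nothing to the loss, so padded frames are excluded from scoring.
import torch
from torch.nn import CrossEntropyLoss

criterion = CrossEntropyLoss()  # ignore_index defaults to -100
logits = torch.randn(8, 4)      # 8 frames, 4 classes
targets = torch.tensor([0, 1, 2, 3, 0, -100, -100, -100])  # last 3 padded
loss = criterion(logits, targets)  # computed over the 5 real frames only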
def __call__(
    self, cuts: CutSet
) -> Union[
    Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
]:
    """
    Reads the audio samples from recordings on disk/other storage
    and computes their features.
    The returned shape is
    ``(B, T, F) => (batch_size, num_frames, num_features)``.

    :return: a tuple of objects: ``(feats, feat_lens, [audios, audio_lens], [cuts])``.
        Tensors ``audios`` and ``audio_lens`` are returned when ``return_audio=True``.
        CutSet ``cuts`` is returned when ``fault_tolerant=True``.
    """
    audios, cuts = read_audio_from_cuts(
        cuts,
        executor=_get_executor(self.num_workers, executor_type=self._executor_type),
        suppress_errors=self.fault_tolerant,
    )

    for tfnm in self.wave_transforms:
        for idx in range(len(audios)):
            audios[idx] = tfnm(audios[idx])

    if self.use_batch_extract:
        # Batch extraction is possibly faster depending on the implementation
        # of the feature extractor.
        assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
        features_single = self.extractor.extract_batch(
            audios, sampling_rate=cuts[0].sampling_rate
        )
    else:
        # Sequential extraction allows the sampling rates to be different.
        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audios[idx].numpy()
            try:
                features = self.extractor.extract(samples, cuts[idx].sampling_rate)
            except Exception:
                logging.error(
                    f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                )
                raise
            features_single.append(torch.from_numpy(features))

    features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

    feature_lens = torch.tensor(
        [
            compute_num_frames(
                cut.duration, self.extractor.frame_shift, cut.sampling_rate
            )
            for cut in cuts
        ],
        dtype=torch.int64,
    )

    out = (features_batch, feature_lens)
    if self.return_audio:
        audios = [a.squeeze(0) for a in audios]  # (1, T) -> (T, )
        audio_lens = torch.tensor([a.size(0) for a in audios], dtype=torch.int64)
        audios = collate_vectors(audios, padding_value=0)
        out = out + (audios, audio_lens)
    if self.fault_tolerant:
        out = out + (cuts,)
    return out
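# Usage sketch for the extended ``__call__``, continuing the earlier
# sketch: the returned tuple grows with the configured flags. It assumes
# ``return_audio`` and ``fault_tolerant`` are constructor arguments of the
# OnTheFlyFeatures-style strategy (names taken from the attributes the
# method reads).
strategy = OnTheFlyFeatures(Fbank(), return_audio=True, fault_tolerant=True)
feats, feat_lens, audios, audio_lens, kept_cuts = strategy(cuts)
# audios: (B, T_samples) zero-padded waveforms; audio_lens: (B,) sample counts.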