Example #1
    def __call__(
        self, cuts: CutSet
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
    ]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :return: a tensor with collated features and a tensor with the ``num_frames`` of each cut before padding; when ``fault_tolerant=True``, the filtered ``CutSet`` is returned as well.
        """
        audios, cuts = read_audio_from_cuts(
            cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
            suppress_errors=self.fault_tolerant,
        )

        for tfnm in self.wave_transforms:
            for idx in range(len(audios)):
                audios[idx] = tfnm(audios[idx])

        if self.use_batch_extract:
            # Batch extraction is possibly faster depending on the implementation
            # of the feature extractor.
            assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
            features_single = self.extractor.extract_batch(
                audios, sampling_rate=cuts[0].sampling_rate
            )
        else:
            # Sequential extraction allows the sampling rates to be different.
            features_single = []
            for idx, cut in enumerate(cuts):
                samples = audios[idx].numpy()
                try:
                    features = self.extractor.extract(samples, cut.sampling_rate)
                except Exception:
                    logging.error(
                        f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                    )
                    raise
                features_single.append(torch.from_numpy(features))

        features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

        feature_lens = torch.tensor(
            [
                compute_num_frames(
                    cut.duration, self.extractor.frame_shift, cut.sampling_rate
                )
                for cut in cuts
            ],
            dtype=torch.int32,
        )

        if self.fault_tolerant:
            return features_batch, feature_lens, cuts
        else:
            return features_batch, feature_lens
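
`collate_matrices` pads a list of variable-length feature matrices into a single batch tensor. A minimal runnable sketch of the padding step above, assuming lhotse's collation helpers live in `lhotse.dataset.collation` and approximating `LOG_EPSILON` as `log(1e-10)`:

import torch
from lhotse.dataset.collation import collate_matrices

# Two feature matrices with different frame counts: (num_frames, num_features).
feats = [torch.randn(100, 80), torch.randn(73, 80)]
# Pad the shorter matrix up to 100 frames with a very low log-energy value,
# approximating lhotse's LOG_EPSILON = log(1e-10).
batch = collate_matrices(feats, padding_value=-23.025850929940457)
print(batch.shape)  # torch.Size([2, 100, 80])
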
Example #2
    def __getitem__(self, cuts: CutSet) -> torch.Tensor:
        self._validate(cuts)
        features = collate_matrices(
            cut.compute_features(
                extractor=self.feature_extractor,
                augment_fn=self.augment_fn,
            )
            for cut in cuts
        )
        return features
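
The `feature_extractor` used here is typically one of lhotse's `FeatureExtractor` implementations. A short sketch of constructing one (the 80-bin configuration is an arbitrary illustration, not something the example prescribes):

from lhotse import Fbank, FbankConfig

# 80-dimensional log-Mel filter bank extractor; any FeatureExtractor works.
feature_extractor = Fbank(FbankConfig(num_mel_bins=80))
# `augment_fn`, when provided, is a callable applied to the raw waveform
# before feature extraction (e.g. noise or speed perturbation).
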
Example #3
    def __getitem__(self, cuts: CutSet) -> torch.Tensor:
        self._validate(cuts)

        def generate_cut(cuts: CutSet):
            for cut in cuts:
                with suppress_audio_loading_errors():
                    yield cut.compute_features(
                        extractor=self.feature_extractor,
                        augment_fn=self.augment_fn,
                    )

        features = collate_matrices(generate_cut(cuts))
        return features
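
`suppress_audio_loading_errors` is a context manager that swallows audio I/O failures, so a single broken recording only shrinks the batch instead of crashing it. A simplified stand-in (the real lhotse helper catches only its audio-specific error types, not all exceptions):

import logging
from contextlib import contextmanager

@contextmanager
def suppress_audio_loading_errors_sketch():
    # Sketch only: lhotse's version narrows the caught types to audio errors.
    try:
        yield
    except Exception as exc:
        logging.warning(f"Skipping cut due to audio loading error: {exc}")
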
Example #4
    def __getitem__(self, cuts: CutSet) -> torch.Tensor:
        self._validate(cuts)

        def generate_cut(cuts: CutSet):
            for cut in cuts:
                with suppress_and_warn(AudioLoadingError, DurationMismatchError):
                    yield cut.compute_features(
                        extractor=self.feature_extractor,
                        augment_fn=self.augment_fn,
                    )

        features = collate_matrices(generate_cut(cuts))
        return features
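
`suppress_and_warn` generalizes the same pattern to an explicit list of exception types, here `AudioLoadingError` and `DurationMismatchError`. A hedged sketch of how such a helper can be written (lhotse ships its own implementation):

import warnings
from contextlib import contextmanager

@contextmanager
def suppress_and_warn_sketch(*exc_types):
    # Sketch only; not the actual lhotse suppress_and_warn.
    try:
        yield
    except exc_types as exc:
        warnings.warn(f"Suppressed {type(exc).__name__}: {exc}")
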
Example #5
    def __getitem__(self, cut_ids: Iterable[str]) -> Dict[str, torch.Tensor]:
        cuts = self.cuts.subset(cut_ids=cut_ids)
        features, features_lens = collate_features(cuts)
        return {
            'features': features,
            'features_lens': features_lens,
            'speaker_activity': collate_matrices(
                (
                    cut.speakers_feature_mask(
                        min_speaker_dim=self.min_speaker_dim,
                        speaker_to_idx_map=self.speakers,
                    )
                    for cut in cuts
                ),
                # In case padding is needed, we will add a special symbol
                # that tells the cross entropy loss to ignore the frame during scoring.
                padding_value=CrossEntropyLoss().ignore_index,
            ),
        }
Example #6
    def __getitem__(self, cuts: CutSet) -> Dict[str, torch.Tensor]:
        features, features_lens = collate_features(cuts)
        return {
            "features": features,
            "features_lens": features_lens,
            "speaker_activity": collate_matrices(
                (
                    cut.speakers_feature_mask(
                        min_speaker_dim=self.min_speaker_dim,
                        speaker_to_idx_map=self.speakers,
                    )
                    for cut in cuts
                ),
                # In case padding is needed, we will add a special symbol
                # that tells the cross entropy loss to ignore the frame during scoring.
                padding_value=CrossEntropyLoss().ignore_index,
            ),
        }
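
Padding `speaker_activity` with `CrossEntropyLoss().ignore_index` (which defaults to -100 in PyTorch) makes the loss skip padded frames entirely. A quick runnable check of that behavior:

import torch
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss()
pad = loss_fn.ignore_index  # -100 by default
logits = torch.randn(2, 4, 3)  # (batch, num_classes, num_frames)
targets = torch.tensor([[1, 2, pad], [0, pad, pad]])  # -100 marks padded frames
print(loss_fn(logits, targets))  # padded positions are excluded from the average
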
Example #7
    def __call__(
        self, cuts: CutSet
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
    ]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :return: a tuple of objects: ``(feats, feat_lens, [audios, audio_lens], [cuts])``.
            Tensors ``audios`` and ``audio_lens`` are returned when ``return_audio=True``.
            CutSet ``cuts`` is returned when ``fault_tolerant=True``.
        """
        audios, cuts = read_audio_from_cuts(
            cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
            suppress_errors=self.fault_tolerant,
        )

        for tfnm in self.wave_transforms:
            for idx in range(len(audios)):
                audios[idx] = tfnm(audios[idx])

        if self.use_batch_extract:
            # Batch extraction is possibly faster depending on the implementation
            # of the feature extractor.
            assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
            features_single = self.extractor.extract_batch(
                audios, sampling_rate=cuts[0].sampling_rate
            )
        else:
            # Sequential extraction allows the sampling rates to be different.
            features_single = []
            for idx, cut in enumerate(cuts):
                samples = audios[idx].numpy()
                try:
                    features = self.extractor.extract(samples, cut.sampling_rate)
                except Exception:
                    logging.error(
                        f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                    )
                    raise
                features_single.append(torch.from_numpy(features))

        features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

        feature_lens = torch.tensor(
            [
                compute_num_frames(
                    cut.duration, self.extractor.frame_shift, cut.sampling_rate
                )
                for cut in cuts
            ],
            dtype=torch.int64,
        )

        out = (features_batch, feature_lens)

        if self.return_audio:
            audios = [a.squeeze(0) for a in audios]  # (1, T) -> (T, )
            audio_lens = torch.tensor([a.size(0) for a in audios], dtype=torch.int64)
            audios = collate_vectors(audios, padding_value=0)

            out = out + (audios, audio_lens)

        if self.fault_tolerant:
            out = out + (cuts,)

        return out
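
The `return_audio` branch pads the raw waveforms with `collate_vectors`, mirroring what `collate_matrices` does for feature matrices. A small runnable illustration, assuming the same `lhotse.dataset.collation` import path:

import torch
from lhotse.dataset.collation import collate_vectors

waves = [torch.randn(16000), torch.randn(12000)]  # two mono waveforms, shape (T,)
audio_lens = torch.tensor([w.size(0) for w in waves], dtype=torch.int64)
audios = collate_vectors(waves, padding_value=0)  # zero-pad to the longest
print(audios.shape, audio_lens)  # torch.Size([2, 16000]) tensor([16000, 12000])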