Example #1
    # Requires at module level: torch, types.SimpleNamespace, and
    # speechbrain.dataio.preprocess.AudioNormalizer
    def __init__(self,
                 modules=None,
                 hparams=None,
                 run_opts=None,
                 freeze_params=True):

        # Arguments passed via the run opts dictionary. Set a limited
        # number of these, since some don't apply to inference.
        run_opt_defaults = {
            "device": "cpu",
            "data_parallel_count": -1,
            "data_parallel_backend": False,
            "distributed_launch": False,
            "distributed_backend": "nccl",
            "jit_module_keys": None,
        }
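        # Per-argument precedence: run_opts, then hparams, then the default.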
        for arg, default in run_opt_defaults.items():
            if run_opts is not None and arg in run_opts:
                setattr(self, arg, run_opts[arg])
            else:
                # The arg is absent from the command-line "run_opts";
                # fall back to hparams if it defines it, else the default.
                if hparams is not None and arg in hparams:
                    setattr(self, arg, hparams[arg])
                else:
                    setattr(self, arg, default)

        # Put modules on the right device, accessible with dot notation
        self.modules = torch.nn.ModuleDict(modules)
        for mod in self.modules:
            self.modules[mod].to(self.device)

        for mod in self.MODULES_NEEDED:
            if mod not in modules:
                raise ValueError(f"Need modules['{mod}']")

        # Check MODULES_NEEDED and HPARAMS_NEEDED and
        # make hyperparams available with dot notation
        if self.HPARAMS_NEEDED and hparams is None:
            raise ValueError("Need to provide hparams dict.")
        if hparams is not None:
            # Also first check that all required params are found:
            for hp in self.HPARAMS_NEEDED:
                if hp not in hparams:
                    raise ValueError(f"Need hparams['{hp}']")
            self.hparams = SimpleNamespace(**hparams)

        # Prepare modules for computation, e.g. jit
        self._prepare_modules(freeze_params)

        # Audio normalization (fall back to a default normalizer when
        # hparams is None or does not provide one)
        self.audio_normalizer = (hparams or {}).get("audio_normalizer",
                                                    AudioNormalizer())
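A minimal usage sketch, assuming this `__init__` belongs to a SpeechBrain-style `Pretrained` base class whose subclasses declare `MODULES_NEEDED` and `HPARAMS_NEEDED`; the `ToyClassifier` name and its toy module are hypothetical:

import torch

class ToyClassifier(Pretrained):  # hypothetical subclass of the class above
    MODULES_NEEDED = ["encoder"]
    HPARAMS_NEEDED = ["sample_rate"]

model = ToyClassifier(
    modules={"encoder": torch.nn.Linear(40, 8)},  # toy stand-in module
    hparams={"sample_rate": 16000},
    run_opts={"device": "cpu"},  # overrides the run_opt_defaults entry
)
print(model.device)               # "cpu", taken from run_opts
print(model.hparams.sample_rate)  # 16000, dot-notation via SimpleNamespace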
Example #2
import logging
import os

import kaldiio
import numpy as np
import torch
from tqdm import tqdm

# SoundScpReader ships with ESPnet (import path assumed)
from espnet2.fileio.sound_scp import SoundScpReader


def main(argv):
    """Load the model, generate kernel and bandpass plots."""
    parser = get_parser()
    args = parser.parse_args(argv)

    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    if torch.cuda.is_available() and ("cuda" in args.device):
        device = args.device
    else:
        device = "cpu"

    if args.toolkit == "speechbrain":
        from speechbrain.dataio.preprocess import AudioNormalizer
        from speechbrain.pretrained import EncoderClassifier

        # Prepare spk2utt for mean x-vector
        spk2utt = dict()
        with open(os.path.join(args.in_folder, "spk2utt"), "r") as reader:
            for line in reader:
                details = line.split()
                spk2utt[details[0]] = details[1:]

        # TODO(nelson): The model inference could be moved into a function.
        classifier = EncoderClassifier.from_hparams(
            source=args.pretrained_model, run_opts={"device": device})
        audio_norm = AudioNormalizer()

        wav_scp = SoundScpReader(os.path.join(args.in_folder, "wav.scp"))
        os.makedirs(args.out_folder, exist_ok=True)
        writer_utt = kaldiio.WriteHelper(
            "ark,scp:{0}/xvector.ark,{0}/xvector.scp".format(args.out_folder))
        writer_spk = kaldiio.WriteHelper(
            "ark,scp:{0}/spk_xvector.ark,{0}/spk_xvector.scp".format(
                args.out_folder))
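        # The "ark,scp:" wspecifier writes a Kaldi archive plus a matching
        # .scp index file in a single pass.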

        for speaker in tqdm(spk2utt):
            xvectors = list()
            for utt in spk2utt[speaker]:
                in_sr, wav = wav_scp[utt]
                # Amplitude normalization to [-1, 1]
                amax = np.amax(np.absolute(wav))
                wav = wav.astype(np.float32) / max(amax, 1e-9)  # guard all-zero input
                # Sample-rate normalization (resample to the normalizer's rate)
                wav = audio_norm(torch.from_numpy(wav), in_sr).to(device)
                # X-vector Embedding
                embeds = classifier.encode_batch(wav).detach().cpu().numpy()[0]
                writer_utt[utt] = np.squeeze(embeds)
                xvectors.append(embeds)

            # Speaker Normalization
            embeds = np.mean(np.stack(xvectors, 0), 0)
            writer_spk[speaker] = embeds
        writer_utt.close()
        writer_spk.close()

    elif args.toolkit == "espnet":
        raise NotImplementedError(
            "Follow details at: https://github.com/espnet/espnet/issues/3040")
    else:
        raise ValueError(
            f"Unknown toolkit {args.toolkit!r}. Only supported: speechbrain, espnet"
        )
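For reference, the speaker-level embedding above is simply the mean of the utterance-level x-vectors. A standalone sketch of that pooling and the kaldiio write pattern with dummy data; the `tmp/` output path, the `spk1` key, and the 512-dim size are assumptions:

import os

import kaldiio
import numpy as np

os.makedirs("tmp", exist_ok=True)
# Dummy utterance-level x-vectors for one speaker (3 utterances, 512 dims).
utt_xvectors = [np.random.randn(512).astype(np.float32) for _ in range(3)]

with kaldiio.WriteHelper("ark,scp:tmp/spk_xvector.ark,tmp/spk_xvector.scp") as w:
    # Speaker normalization: mean over the stacked utterance embeddings,
    # matching the loop in main() above.
    w["spk1"] = np.mean(np.stack(utt_xvectors, 0), 0)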