Code Example #1
    def preprocess(self):
        wav = self.ui.selected_utterance.wav
        out = aukit.remove_noise(wav, sr=Synthesizer.sample_rate)  # denoise at the synthesizer's sample rate
        # Voice activity detection (VAD) settings used to trim long silences
        hp = aukit.Dict2Obj({})
        hp["vad_window_length"] = 10  # VAD window size in milliseconds
        hp["vad_moving_average_width"] = 2  # windows averaged when smoothing the voiced/unvoiced flags
        hp["vad_max_silence_length"] = 2  # maximum number of consecutive silent windows kept
        hp["audio_norm_target_dBFS"] = -32  # loudness normalization target
        hp["sample_rate"] = 16000
        hp["int16_max"] = (2**15) - 1
        out = trim_long_silences(out, hparams=hp)

        spec = Synthesizer.make_spectrogram(out)
        self.ui.draw_align(spec[::-1], "current")  # flip the frequency axis for display
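
The VAD settings above map directly onto the audio. A short sketch of the arithmetic, assuming the trimming operates on fixed-size VAD windows as in the upstream Real-Time-Voice-Cloning code:

sample_rate = 16000
vad_window_length = 10  # milliseconds, as set above
samples_per_window = (vad_window_length * sample_rate) // 1000
print(samples_per_window)  # 160 samples go into each VAD decision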
Code Example #2
    def synthesize(self):
        self.ui.log("Generating the mel spectrogram...")
        self.ui.set_loading(1)

        # Synthesize the spectrogram
        if self.synthesizer is None:
            model_dir = Path(self.ui.current_synthesizer_model_dir)
            checkpoints_dir = model_dir.joinpath("checkpoints")
            hp_path = model_dir.joinpath(
                "metas", "hparams.json")  # load from trained models
            if hp_path.exists():
                hparams = aukit.Dict2Obj(
                    json.load(open(hp_path, encoding="utf8")))
            else:
                hparams = None
            self.synthesizer = Synthesizer(checkpoints_dir,
                                           low_mem=self.low_mem,
                                           hparams=hparams)
        if not self.synthesizer.is_loaded():
            # The checkpoint itself is loaded lazily by synthesize_spectrograms below.
            self.ui.log("Loading the synthesizer %s" %
                        self.synthesizer.checkpoint_fpath)

        ptext = self.ui.text_prompt.toPlainText()
        texts = ptext.split("\n")  # one utterance per line of text

        embed = self.ui.selected_utterance.embed
        embeds = np.stack([embed] * len(texts))  # reuse the same speaker embedding for every line
        specs, aligns = self.synthesizer.synthesize_spectrograms(
            texts, embeds, return_alignments=True)

        breaks = [spec.shape[1] for spec in specs]  # frame count of each utterance
        spec = np.concatenate(specs, axis=1)  # concatenate along the time axis
        align = np.concatenate(aligns, axis=1)

        fref = self.ui.selected_utterance.name
        ftext = '。'.join(texts)  # join the lines with an ideographic full stop
        ftime = '{}'.format(time_formatter())
        # File name: reference utterance, timestamp, character count, text
        fname = filename_formatter('{}_{}_{}zi_{}.npy'.format(
            fref, ftime, len(ftext), ftext))
        np.save(self._out_mel_dir.joinpath(fname), spec,
                allow_pickle=False)  # save the generated mel spectrogram

        self.ui.draw_spec(spec, "generated")
        self.ui.draw_align(align, "generated")
        self.current_generated = (self.ui.selected_utterance.speaker_name,
                                  spec, breaks, None)
        self.ui.set_loading(0)
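
The same batch synthesis can be driven without the GUI. A minimal sketch, assuming synthesizer is an already constructed Synthesizer (as in code example #3), embed is a speaker embedding produced by the encoder, and the output file name is hypothetical:

import numpy as np

texts = ["第一句话", "第二句话"]  # one utterance per line of text
embeds = np.stack([embed] * len(texts))  # reuse the same speaker embedding

specs, aligns = synthesizer.synthesize_spectrograms(
    texts, embeds, return_alignments=True)

mel = np.concatenate(specs, axis=1)  # concatenate along the time axis
np.save("generated_mel.npy", mel, allow_pickle=False)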
Code Example #3
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath, device='cpu')

    # Load hparams from the model directory
    hp_path = args.syn_model_dir.parent.joinpath(
        "metas", "hparams.json")  # load from trained models
    if hp_path.exists():
        hparams = aukit.Dict2Obj(json.load(open(hp_path, encoding="utf8")))
        print('hparams:')
        print(
            json.dumps({k: v
                        for k, v in hparams.items()},
                       ensure_ascii=False,
                       indent=4))
    else:
        hparams = None
        print('hparams:', hparams)

    synthesizer = Synthesizer(args.syn_model_dir,
                              low_mem=args.low_mem,
                              hparams=hparams)

    # vocoder.load_model(args.voc_model_fpath)
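
If the commented-out vocoder line were enabled, the mel spectrograms from the synthesizer could be converted into audio. A minimal sketch, assuming the vocoder module exposes the same load_model / infer_waveform interface as the upstream Real-Time-Voice-Cloning project, with mel standing in for a spectrogram returned by synthesizer.synthesize_spectrograms:

vocoder.load_model(args.voc_model_fpath)  # load the neural vocoder checkpoint
wav = vocoder.infer_waveform(mel)  # mel spectrogram -> waveform samples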