Example #1
    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'),
                embed,
                allow_pickle=False)  # save

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)
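The snippets on this page share one pattern: check encoder.is_loaded() before embedding so the model is loaded at most once. A minimal standalone sketch of that pattern, assuming the Real-Time-Voice-Cloning package layout (from encoder import inference as encoder, as in Examples #4 and #6); the model path is a hypothetical placeholder:

from pathlib import Path

import numpy as np
from encoder import inference as encoder

def compute_embedding(wav: np.ndarray, model_fpath=Path("encoder/saved_models/pretrained.pt")):
    # Load the model lazily; is_loaded() makes repeated calls cheap.
    if not encoder.is_loaded():
        encoder.load_model(model_fpath)
    wav = encoder.preprocess_wav(wav)  # resample, normalize volume, trim long silences
    return encoder.embed_utterance(wav)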
Example #2
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        self.ui.save_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name if speaker_name is not None else "unknown"
        name = name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
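The "Add breaks" block above interleaves 150 ms of silence between the per-sentence segments that the synthesizer reported via breaks (frame counts). A toy sketch of the same arithmetic, with placeholder hop_size and sample_rate values rather than the repo's actual hparams:

import numpy as np

hop_size, sample_rate = 200, 16000        # placeholder values, not the repo's hparams
breaks = [40, 55, 30]                     # example: spectrogram frames per sentence
wav = np.random.randn(sum(breaks) * hop_size).astype(np.float32)

b_ends = np.cumsum(np.array(breaks) * hop_size)    # sample index where each sentence ends
b_starts = np.concatenate(([0], b_ends[:-1]))      # ... and where it starts
segments = [wav[s:e] for s, e in zip(b_starts, b_ends)]
silence = np.zeros(int(0.15 * sample_rate), dtype=np.float32)
wav_out = np.concatenate([x for seg in segments for x in (seg, silence)])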
Example #3
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
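The wav / np.abs(wav).max() * 0.97 line that precedes playback in these examples is simple peak normalization: scale the signal so the loudest sample sits just under full scale, leaving headroom against clipping. As a reusable helper (my naming, not the repo's):

import numpy as np

def normalize_peak(wav: np.ndarray, peak: float = 0.97) -> np.ndarray:
    # Scale so max(|wav|) == peak; assumes wav is not all zeros.
    return wav / np.abs(wav).max() * peak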
Example #4
def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
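Hypothetical usage of transform_embed as defined above; the model path is an assumption, and the fake audio exists only to exercise the call. The 256-dim output matches the default speaker-encoder configuration in this codebase:

import numpy as np
from pathlib import Path

wav = np.random.randn(16000).astype(np.float32)   # one second of fake audio at the encoder's rate
embed = transform_embed(wav, Path("encoder/saved_models/pretrained.pt"))
print(embed.shape)                                # typically (256,)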
Example #5
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)
Example #6
    def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
        """Get PPG and Mel (+ optional F0) for an utterance.

        Args:
            data_utterance_path: The path to the data utterance protocol buffer.
            is_full_ppg: If True, will use the full PPGs.

        Returns:
            A [ppg, mel, dvec (speaker embedding)] list for the utterance.
        """
        utt = Utterance()
        fs, wav = wavfile.read(data_utterance_path)
        utt.fs = fs
        utt.wav = wav
        utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

        audio = torch.FloatTensor(utt.wav.astype(np.float32))
        fs = utt.fs

        if fs != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                fs, self.stft.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        # (torch.autograd.Variable is a no-op since PyTorch 0.4, so no wrapping is needed)
        # (1, n_mel_channels, T)
        acoustic_feats = self.stft.mel_spectrogram(audio_norm)
        # (n_mel_channels, T)
        acoustic_feats = torch.squeeze(acoustic_feats, 0)
        # (T, n_mel_channels)
        acoustic_feats = acoustic_feats.transpose(0, 1)
        
        #print("encoder model path", self.encoder_model_fpath)
        
        from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath)
        
        # Compute the speaker embedding from the utterance audio
        wav = encoder.preprocess_wav(data_utterance_path)
        embed = encoder.embed_utterance(wav)

        if is_full_ppg:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.ppg, acoustic_feats, embed]
        else:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.monophone_ppg, acoustic_feats, embed]
Example #7
    def get_embed(self, wav):
        # from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath, device='cpu')
            # Use the CPU to avoid the following error:
            # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method"

        wav = encoder.preprocess_wav(wav)
        embed = encoder.embed_utterance(wav)
        return embed
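The comment above quotes the standard CUDA/multiprocessing pitfall: a forked worker cannot re-initialize CUDA, hence device='cpu'. The alternative named by the error message itself is the 'spawn' start method; a minimal sketch:

import multiprocessing as mp

if __name__ == "__main__":
    # Alternative to forcing device='cpu' in each worker: spawn fresh
    # interpreter processes so CUDA can initialize per process.
    mp.set_start_method("spawn")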
Example #8
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
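Workers shaped like embed_utterance above are typically driven from a process pool, with functools.partial binding the model path, which is how the voice-cloning preprocessing scripts use them. A hedged sketch assuming the worker above is importable and with hypothetical file paths:

from functools import partial
from multiprocessing import Pool
from pathlib import Path

fpaths = [(Path("audio/utt1.npy"), Path("embeds/utt1.npy"))]   # (wav_fpath, embed_fpath) pairs
func = partial(embed_utterance, encoder_model_fpath=Path("encoder/saved_models/pretrained.pt"))
with Pool(4) as pool:
    list(pool.imap(func, fpaths))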
Example #9
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    embed_fpath = wav_fpath.replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #10
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src

    if skip_existing and embed_fpath.is_file():
        return

    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
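The skip_existing guard makes the pass idempotent: re-running preprocessing only computes embeddings that are still missing. The same guard as a standalone predicate (the helper name is mine):

from pathlib import Path

def needs_embedding(embed_fpath: Path, skip_existing: bool = True) -> bool:
    # True when the embedding still has to be computed.
    return not (skip_existing and embed_fpath.is_file())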
Example #11
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath, _ = fpaths
    wav = encoder.preprocess_wav(wav_fpath)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #12
def signup(wav_fpath: Path, username, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    embed_fpath = signup_dir.joinpath(username + ".npy")
    wav = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(wav)
    if os.path.exists(embed_fpath):
        old_embed = np.load(embed_fpath)
        embed = old_embed + embed
        embed /= np.linalg.norm(embed, 2)
        os.remove(embed_fpath)
    np.save(embed_fpath, embed, allow_pickle=False)
    print(username + " signed up.")
Example #13
def transform_embed(wav, encoder_model_fpath=Path()):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_ = encoder.preprocess_wav(wav)
    # Take segment
    segment_length = 2 * encoder.sampling_rate  # randomly take a 2-second segment to compute the speaker embedding
    if len(wav_) > segment_length:
        max_audio_start = len(wav_) - segment_length
        audio_start = random.randint(0, max_audio_start)
        wav_ = wav_[audio_start:audio_start + segment_length]

    embed = encoder.embed_utterance(wav_)
    return embed
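The random 2-second crop bounds the cost of embedding long files and varies the segment seen across calls. Toy illustration with a fake 5-second signal at an assumed 16 kHz encoder rate:

import random

import numpy as np

sr = 16000                         # assumed encoder sampling rate
wav = np.zeros(5 * sr, dtype=np.float32)
segment_length = 2 * sr
start = random.randint(0, len(wav) - segment_length)
crop = wav[start:start + segment_length]
assert len(crop) == segment_length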
Example #14
def embed_utterance(fpaths, encoder_model_fpath, hparams):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    if embed_fpath.exists():
        return
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #15
    def get_embed(self, wav):
        # from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath, device='cpu')
            # Use the CPU to avoid the following error:
            # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method"

        wav_ = encoder.preprocess_wav(wav)
        # Take segment
        segment_length = 2 * encoder.sampling_rate  # randomly take a 2-second segment to compute the speaker embedding
        if len(wav_) > segment_length:
            max_audio_start = len(wav_) - segment_length
            audio_start = random.randint(0, max_audio_start)
            wav_ = wav_[audio_start:audio_start + segment_length]

        embed = encoder.embed_utterance(wav_)
        return embed
Example #16
def embed_utterance(fpaths, encoder_model_fpath, module_name, reject_list_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath, module_name=module_name)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)

    if embed is None:
        with open(reject_list_fpath, 'a') as reject_file:
            reject_file.write(str(os.path.basename(embed_fpath)) + '\n')
    else:
        if embed.shape[0] == 128:
            # Duplicate a 128-dim embedding to match the 256 dims expected downstream
            embed = np.concatenate((embed, embed), axis=0)
        np.save(embed_fpath, embed, allow_pickle=False)
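The shape check above pads a 128-dim embedding out to the 256 dims this pipeline apparently expects by duplicating it. A toy check of the shape arithmetic:

import numpy as np

e = np.ones(128, dtype=np.float32)
assert np.concatenate((e, e), axis=0).shape == (256,)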
Example #17
def signin(wav_or_wavpath, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav_or_wavpath)
    embed = encoder.embed_utterance(wav)
    embed = np.reshape(embed, [np.shape(embed)[0], 1])  # [emb_dim, 1]

    signed_spk_embs = list(signup_dir.glob("*.npy"))
    signed_spk_name = [_dir.stem for _dir in signed_spk_embs]
    signed_spk_embs = [np.load(str(_dir)) for _dir in signed_spk_embs]
    signed_spk_embs = np.array(signed_spk_embs)  # [n, emb_dim]

    print(signed_spk_name)
    print(np.shape(signed_spk_embs), np.shape(embed))
    similar_score = np.matmul(signed_spk_embs, embed)
    similar_score = np.reshape(similar_score, [-1])
    sim_id = np.argmax(similar_score)
    sim_name = signed_spk_name[sim_id]
    for name, score in zip(signed_spk_name, similar_score):
        print(name, score)
    print("\nMatching name: ", sim_name)
Example #18
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Initialize the vocoder model and make it deterministic, if the user provides a seed
        if self.ui.random_seed_checkbox.isChecked():
            seed = self.synthesizer.set_seed(int(self.ui.seed_textbox.text()))
            self.ui.populate_gen_options(seed, self.trim_silences)
        else:
            seed = None

        if seed is not None:
            torch.manual_seed(seed)

        # Synthesize the waveform
        if not vocoder.is_loaded() or seed is not None:
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)
        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")
        
        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Trim excessive silences
        if self.ui.trim_silences_checkbox.isChecked():
            wav = encoder.preprocess_wav(wav)

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        # Name it (history displayed in combobox)
        # TODO better naming for the combobox items?
        wav_name = str(self.waves_count + 1)

        # Update waves combobox
        self.waves_count += 1
        if self.waves_count > MAX_WAVES:
            self.waves_list.pop()
            self.waves_namelist.pop()
        self.waves_list.insert(0, wav)
        self.waves_namelist.insert(0, wav_name)

        self.ui.waves_cb.disconnect()
        self.ui.waves_cb_model.setStringList(self.waves_namelist)
        self.ui.waves_cb.setCurrentIndex(0)
        self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)

        # Update current wav
        self.set_current_wav(0)
        
        # Enable replay and save buttons
        self.ui.replay_wav_button.setDisabled(False)
        self.ui.export_wav_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
        
        # Add the utterance
        name = speaker_name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
        self.utterances.add(utterance)
        
        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
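The seed handling above is what makes a run reproducible: fixing torch's RNG before re-initializing the vocoder makes its sampling deterministic run-to-run (assuming no other sources of nondeterminism). A quick sanity sketch of that property:

import torch

torch.manual_seed(1234)
x = torch.rand(3)
torch.manual_seed(1234)
assert torch.equal(x, torch.rand(3))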
Example #19
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        wav = None
        vocname = ""
        if self.ui.current_vocoder_fpath is not None:
            model_fpath = self.ui.current_vocoder_fpath
            vocname = Path(model_fpath).parent.stem
            if vocname == "melgan":
                self.ui.log("Waveform generation with MelGAN... ")
                wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath)

            elif vocname == "wavernn":
                self.ui.log("Waveform generation with WaveRNN... ")
                wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)

        if wav is None:
            vocname = "griffinlim"
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = self.ui.selected_utterance.name
        ftime = '{}'.format(time_formatter())
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fvoc = vocname
        fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext))
        audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_{}".format(time_formatter())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #20
def embed_utterance(wav, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath, device="cpu")
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed