def add_real_utterance(self, wav, name, speaker_name):
    """Register a recorded/loaded utterance and refresh the related plots.

    The waveform is converted to a mel spectrogram and to a speaker
    embedding, bundled into an ``Utterance`` that is stored in
    ``self.utterances`` and registered with the UI, and the "current"
    spectrogram, embedding and UMAP projection views are redrawn.

    Args:
        wav: Waveform passed to ``Synthesizer.make_spectrogram`` and
            ``encoder.preprocess_wav``.
        name: Name given to the utterance and passed to the embedding plot.
        speaker_name: Speaker label stored on the utterance.
    """
    # Mel spectrogram first, so the UI shows it before the encoder runs
    mel = Synthesizer.make_spectrogram(wav)
    self.ui.draw_spec(mel, "current")

    # The encoder model is only initialized the first time it is needed
    if not encoder.is_loaded():
        self.init_encoder()
    preprocessed = encoder.preprocess_wav(wav)
    embedding, partials, _ = encoder.embed_utterance(preprocessed, return_partials=True)

    # Store the utterance; the trailing False is presumably the
    # "generated" flag (the vocode path passes True) - TODO confirm
    new_utterance = Utterance(name, speaker_name, wav, mel, embedding, partials, False)
    self.utterances.add(new_utterance)
    self.ui.register_utterance(new_utterance)

    # Refresh the embedding heatmap and the projection of all utterances
    self.ui.draw_embed(embedding, name, "current")
    self.ui.draw_umap_projections(self.utterances)
def vocode(self):
    """Convert the last generated spectrogram to audio, play it and record it.

    Steps, in order:
      1. Read ``(speaker_name, spec, breaks, _)`` from ``self.current_generated``.
      2. If the random-seed checkbox is ticked, seed the synthesizer and
         torch, and re-initialize the vocoder.
      3. Produce a waveform with the vocoder when
         ``self.ui.current_vocoder_fpath`` is set, otherwise with Griffin-Lim.
      4. Insert 0.15 s of silence after every segment described by ``breaks``.
      5. Optionally trim silences, peak-normalize to 0.97 and play the audio.
      6. Push the audio into the wave history (``self.waves_list`` /
         ``self.waves_namelist``, capped by ``MAX_WAVES``) and refresh the
         combobox and buttons.
      7. Embed the audio, store it as a generated ``Utterance`` and redraw
         the embedding and UMAP plots.

    Raises:
        AssertionError: if there is no generated spectrogram.
        ValueError: presumably, if the seed textbox does not hold an integer
            (the ``int()`` conversion is not guarded).
    """
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Initialize the vocoder model and make it deterministic, if user provides a seed
    if self.ui.random_seed_checkbox.isChecked():
        # NOTE(review): assumes set_seed() returns the seed it applied - confirm
        seed = self.synthesizer.set_seed(int(self.ui.seed_textbox.text()))
        self.ui.populate_gen_options(seed, self.trim_silences)
    else:
        seed = None

    if seed is not None:
        torch.manual_seed(seed)

    # Synthesize the waveform. The vocoder is (re)initialized when it is not
    # loaded yet or whenever a seed is in use.
    if not vocoder.is_loaded() or seed is not None:
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        # Progress callback handed to the vocoder: logs throughput and
        # advances the loading bar.
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Add breaks: `breaks` holds per-segment lengths that are scaled by the
    # hop size to get sample offsets; each segment is followed by 0.15 s of
    # zeros.
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Trim excessive silences
    if self.ui.trim_silences_checkbox.isChecked():
        wav = encoder.preprocess_wav(wav)

    # Play it. Peak-normalize to 0.97, but only when there is a peak: an
    # all-zero waveform would otherwise be divided by zero and turn into NaNs.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    # Name it (history displayed in combobox)
    # TODO better naming for the combobox items?
    wav_name = str(self.waves_count + 1)

    # Update waves combobox: newest entry goes first, oldest is dropped once
    # more than MAX_WAVES waves have been generated.
    self.waves_count += 1
    if self.waves_count > MAX_WAVES:
        self.waves_list.pop()
        self.waves_namelist.pop()
    self.waves_list.insert(0, wav)
    self.waves_namelist.insert(0, wav_name)

    # The combobox signal is disconnected while its model is repopulated and
    # reconnected afterwards (presumably to avoid spurious index-changed
    # callbacks during the update).
    self.ui.waves_cb.disconnect()
    self.ui.waves_cb_model.setStringList(self.waves_namelist)
    self.ui.waves_cb.setCurrentIndex(0)
    self.ui.waves_cb.currentIndexChanged.connect(self.set_current_wav)

    # Update current wav
    self.set_current_wav(0)

    # Enable replay and save buttons:
    self.ui.replay_wav_button.setDisabled(False)
    self.ui.export_wav_button.setDisabled(False)

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance under a random 5-digit suffix
    name = speaker_name + "_gen_%05d" % np.random.randint(100000)
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
def vocode(self):
    """Convert the last generated spectrogram to audio, play it and save it.

    Steps, in order:
      1. Read ``(speaker_name, spec, breaks, _)`` from ``self.current_generated``.
      2. Pick the vocoder from the name of the directory containing
         ``self.ui.current_vocoder_fpath``: ``"melgan"`` or ``"wavernn"``.
         When no vocoder path is set, or the directory name is neither of
         those, Griffin-Lim is used instead.
      3. Peak-normalize the waveform to 0.97 and play it.
      4. Save the waveform into ``self._out_wav_dir`` under a file name built
         from the reference utterance name, a timestamp, the vocoder name,
         the duration in milliseconds and the prompt text.
      5. Embed the audio, save the embedding as ``.npy`` into
         ``self._out_embed_dir``, store a generated ``Utterance`` and redraw
         the embedding and UMAP plots.

    Raises:
        AssertionError: if there is no generated spectrogram.
    """
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        # Progress callback handed to the WaveRNN vocoder: logs throughput
        # and advances the loading bar.
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    wav = None
    vocname = ""
    if self.ui.current_vocoder_fpath is not None:
        model_fpath = self.ui.current_vocoder_fpath
        # The vocoder type is identified by the model's parent directory name
        vocname = Path(model_fpath).parent.stem
        if vocname == "melgan":
            self.ui.log("Waveform generation with MelGAN... ")
            wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath)
        elif vocname == "wavernn":
            self.ui.log("Waveform generation with WaveRNN... ")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    if wav is None:
        # No vocoder selected, or its directory name was not recognized
        vocname = "griffinlim"
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Play it. Peak-normalize to 0.97, but only when there is a peak: an
    # all-zero waveform would otherwise be divided by zero and turn into NaNs.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    # Build the output file name: <reference>_<time>_<vocoder>_<ms>ms_<text>.wav
    fref = self.ui.selected_utterance.name
    ftime = '{}'.format(time_formatter())
    ftext = self.ui.text_prompt.toPlainText()
    fms = int(len(wav) * 1000 / Synthesizer.sample_rate)  # duration in milliseconds
    fvoc = vocname
    fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext))
    audio.save_wav(wav, self._out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance; its name carries its own timestamp, taken separately
    # from the one used in the wav file name above.
    name = speaker_name + "_gen_{}".format(time_formatter())
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

    np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)