def vocode(self): speaker_name, spec, breaks, _ = self.current_generated assert spec is not None # Synthesize the waveform if not vocoder.is_loaded(): self.init_vocoder() def vocoder_progress(i, seq_len, b_size, gen_rate): real_time_factor = (gen_rate / synthesizer.hparams.sample_rate) * 1000 line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \ % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor) self.ui.log(line, "overwrite") self.ui.set_loading(i, seq_len) if self.ui.current_vocoder_fpath is not None: self.ui.log("") wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress) else: self.ui.log("Waveform generation with Griffin-Lim... ") wav = synthesizer.griffin_lim(spec) self.ui.set_loading(0) self.ui.log(" Done!", "append") # Add breaks b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size) b_starts = np.concatenate(([0], b_ends[:-1])) wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)] breaks = [np.zeros(int(0.15 * synthesizer.hparams.sample_rate)) ] * len(breaks) wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)]) # Play it wav = wav / np.abs(wav).max() * 0.97 self.ui.play(wav, synthesizer.sample_rate) # Compute the embedding # TODO: this is problematic with different sampling rates, gotta fix it if not encoder.is_loaded(): self.init_encoder() encoder_wav = encoder.load_preprocess_wav(wav) embed, partial_embeds, _ = encoder.embed_utterance( encoder_wav, return_partials=True) # Add the utterance name = speaker_name + "_gen_%05d" % np.random.randint(100000) utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True) self.utterances.add(utterance) # Plot it self.ui.draw_embed(embed, name, "generated") self.ui.draw_umap_projections(self.utterances)
def add_real_utterance(self, wav, name, speaker_name): # Compute the mel spectrogram spec = synthesizer.make_spectrogram(wav) self.ui.draw_spec(spec, "current") # Compute the embedding if not encoder.is_loaded(): self.init_encoder() encoder_wav = encoder.load_preprocess_wav(wav) embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) # Add the utterance utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False) self.utterances.add(utterance) self.ui.register_utterance(utterance) # Plot it self.ui.draw_embed(embed, name, "current") self.ui.draw_umap_projections(self.utterances)