def synthesize(self):
    self.ui.log("Generating the mel spectrogram...")
    self.ui.set_loading(1)

    # Synthesize the spectrogram
    if self.synthesizer is None:
        model_dir = self.ui.current_synthesizer_model_dir
        checkpoints_dir = model_dir.joinpath("taco_pretrained")
        self.synthesizer = Synthesizer(checkpoints_dir, low_mem=self.low_mem)
    if not self.synthesizer.is_loaded():
        self.ui.log("Loading the synthesizer %s" % self.synthesizer.checkpoint_fpath)

    # Convert the text prompt to phonemes before passing it to the synthesizer
    texts = self.ui.text_prompt.toPlainText().split("\n")
    texts = g2p(texts)

    # Use the embedding of the currently selected utterance for every line of text
    embed = self.ui.selected_utterance.embed
    embeds = np.stack([embed] * len(texts))
    specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    self.ui.draw_spec(spec, "generated")
    self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None)
    self.ui.set_loading(0)
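
# The synthesis and preprocessing paths in this section all call g2p() to turn raw
# text into phonemes before it reaches the synthesizer, but the helper itself is not
# shown here. Below is only a minimal sketch of what such a batch grapheme-to-phoneme
# wrapper could look like, assuming the English g2p_en package purely for
# illustration; the actual g2p() used in this codebase is likely language-specific
# and may differ.
from g2p_en import G2p

_g2p = G2p()

def g2p(texts):
    """Convert a list of sentences into space-separated phoneme strings."""
    converted = []
    for text in texts:
        phonemes = _g2p(text)  # list of phoneme and punctuation tokens
        converted.append(" ".join(p for p in phonemes if p != " "))
    return converted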
def preprocess_speaker_sst(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    metadata = []
    lines = []
    texts = []

    # metadata.csv pairs each wav path with the path of its transcript file
    with open(os.path.join(speaker_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(',')
            # Optionally keep only utterances between 2 and 7 seconds:
            # if 2 < float(parts[2]) < 7:
            lines.append(parts[0])
            with open(os.path.join(speaker_dir, parts[1]), encoding='utf-8') as f2:
                for line2 in f2:
                    texts.append(line2)

    # Convert all transcripts to phonemes in a single batch
    texts = g2p(texts)

    for basename, text in zip(lines, texts):
        wav_path = os.path.join(speaker_dir, basename)
        wav, _ = librosa.load(wav_path, hparams.sample_rate)
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max

        # Build a unique utterance name from the relative wav path: "sl_<dir>_<subdir>_<file>"
        path_parts = basename.strip().split('/')
        sub_basename = "sl_" + path_parts[0] + "_" + path_parts[1] + "_" + path_parts[2]
        metadata.append(process_utterance(wav, text, out_dir, sub_basename, skip_existing, hparams))

    return [m for m in metadata if m is not None]
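
# Illustrative only: judging from how the columns are read in preprocess_speaker_sst
# above, each row of its metadata.csv holds a relative wav path, a relative
# transcript path, and (per the commented-out filter) a duration in seconds. The
# example row below is hypothetical.
example_row = "book_00/chapter_01/utt_0001.wav,book_00/chapter_01/utt_0001.txt,4.2"
wav_rel_path, txt_rel_path, duration = example_row.strip().split(',')
# wav_rel_path feeds librosa.load(); the transcript file is read line by line into texts.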
def preprocess_speaker2(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    metadata = []
    wavs = []
    texts = []
    sub_basenames = []

    for book_dir in speaker_dir.glob("*"):
        # Gather the utterance audios and texts
        try:
            alignments_fpath = next(book_dir.glob("*.alignment.txt"))
            with alignments_fpath.open("r") as alignments_file:
                alignments = [line.rstrip().split(" ") for line in alignments_file]
        except StopIteration:
            # A few alignment files will be missing
            continue

        for wav_fname, words, end_times in alignments:
            wav_fpath = book_dir.joinpath(wav_fname + ".flac")
            assert wav_fpath.exists()
            words = words.replace("\"", "").split(",")
            end_times = list(map(float, end_times.replace("\"", "").split(",")))

            # Split the utterance into sub-utterances on silences, and remember which
            # source file each one came from so it can be named correctly later
            wav, text = split_on_silences(wav_fpath, words, end_times, hparams)
            sub_basenames.extend("%s_%02d" % (wav_fname, i) for i in range(len(wav)))
            wavs.extend(wav)
            texts.extend(text)

    # Convert all sub-utterance texts to phonemes in a single batch
    texts = g2p(texts)

    for wav, text, sub_basename in zip(wavs, texts, sub_basenames):
        metadata.append(process_utterance(wav, text, out_dir, sub_basename, skip_existing, hparams))

    return [m for m in metadata if m is not None]
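
# Illustrative only: preprocess_speaker2 above parses LibriSpeech-style
# *.alignment.txt files. Each line is expected to hold an utterance id, a quoted
# comma-separated word list (empty entries mark silences), and a matching quoted
# list of word end times in seconds; the example line below is made up.
example_line = '84-121123-0000 ",GO,DO,YOU,HEAR," "0.49,0.89,1.17,1.40,1.67,1.81"'
wav_fname, words, end_times = example_line.rstrip().split(" ")
words = words.replace('"', '').split(',')            # ['', 'GO', 'DO', 'YOU', 'HEAR', '']
end_times = list(map(float, end_times.replace('"', '').split(',')))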
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams):
    metadata = []
    lines = []
    texts = []
    index = 1
    with open(os.path.join(speaker_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('|')
            basename = parts[0]
            text = parts[2]
            lines.append(basename)
            texts.append(text)

    texts = g2p(texts)

    for basename, text in zip(lines, texts):
        wav_path = os.path.join(speaker_dir, '{}.wav'.format(basename))
        wav, _ = librosa.load(wav_path, hparams.sample_rate)
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        metadata.append(process_utterance(wav, text, out_dir, basename, skip_existing, hparams))
        index += 1

    return [m for m in metadata if m is not None]
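
# Illustrative only: preprocess_speaker above expects a pipe-separated metadata.csv
# in the LJSpeech-style layout (basename | raw text | normalized text); the row
# below is a made-up example. Column 0 names the wav file and column 2 is the text
# that is passed to g2p().
example_row = "utt_0001|Hello world.|Hello world."
basename, _, text = example_row.strip().split('|')
# The corresponding audio is then loaded from <speaker_dir>/utt_0001.wav.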
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
print("Loaded file successfully")

# Then we derive the embedding. The speaker encoder exposes many functions and
# parameters, mostly for in-depth research. You will typically only use this
# function (with its default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")


## Generating the spectrogram
# text = input("Write a sentence (+-20 words) to be synthesized:\n")

# The synthesizer works in batch, so you need to put your data in a list or numpy array
texts = [args.text]
texts = g2p(texts)
print(texts)
embeds = [embed]

# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")


## Generating the waveform
print("Synthesizing the waveform:")

# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)
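
# A sketch of how the generated waveform could be written to disk afterwards,
# assuming the soundfile package and the synthesizer's sample rate; the exact
# post-processing in the original demo script may differ.
import numpy as np
import soundfile as sf

# Pad with a second of silence so the tail of the utterance is not cut off,
# then save as a 32-bit float wav at the synthesizer's sample rate.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
sf.write("demo_output.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)
print("Saved the output to demo_output.wav")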