def vocode(self):
    """Turn the currently generated spectrogram into a waveform, play it, save it
    to disk, embed it with the speaker encoder and register it as a new utterance.

    Reads `self.current_generated` = (speaker_name, spec, breaks, _); `spec` must
    already be synthesized. Side effects: UI logging/plots, wav + embedding files
    written under `_out_wav_dir` / `_out_embed_dir`.
    """
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform with the neural vocoder (lazily loaded).
    if not vocoder.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        # Progress callback from the vocoder; gen_rate is presumably in kHz
        # (it is divided by sample_rate and scaled by 1000) — TODO confirm.
        real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    # Use the loaded vocoder model if one is selected, otherwise fall back to
    # Griffin-Lim inversion.
    if self.ui.current_vocoder_fpath is not None:
        self.ui.log("")
        wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
    else:
        self.ui.log("Waveform generation with Griffin-Lim... ")
        wav = Synthesizer.griffin_lim(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Re-insert pauses: `breaks` holds the per-segment lengths in frames; cut the
    # waveform at frame boundaries and interleave 150 ms of silence between parts.
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Normalize to 0.97 peak amplitude and play it.
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, Synthesizer.sample_rate)

    # Build an output filename from dataset/speaker/utterance, a timestamp, the
    # duration in ms and the prompt text, then save the wav.
    fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name,
                     self.ui.current_utterance_name])
    ftime = '{}'.format(int(time.time()))
    ftext = self.ui.text_prompt.toPlainText()
    fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
    fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
    audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

    # Compute the embedding of the generated waveform.
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Register the generated utterance and persist its embedding.
    name = speaker_name + "_gen_%05d" % int(time.time())
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save
    self.utterances.add(utterance)

    # Plot the new embedding and refresh the UMAP projection.
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step,
               plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams):
    """Dump evaluation artifacts for one sample at a given training step.

    Writes the attention plot, the predicted mel as .npy, a Griffin-Lim
    inverted wav, and a real-vs-predicted spectrogram plot, then prints the
    decoded input sequence. Purely side-effecting; returns None.
    """
    # Attention alignment plot.
    align_fpath = str(
        plot_dir.joinpath("attention_step_{}_sample_{}".format(step, sample_num)))
    save_attention(attention, align_fpath)

    # Predicted mel spectrogram as a numpy file (debug aid).
    pred_mel_fpath = mel_output_dir.joinpath(
        "mel-prediction-step-{}_sample_{}.npy".format(step, sample_num))
    np.save(str(pred_mel_fpath), mel_prediction, allow_pickle=False)

    # Griffin-Lim inversion of the predicted mel (mel -> wav) for listening checks.
    inverted = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
    out_wav_fpath = wav_dir.joinpath(
        "step-{}-wave-from-mel_sample_{}.wav".format(step, sample_num))
    audio.save_wav(inverted, str(out_wav_fpath), sr=hparams.sample_rate)

    # Side-by-side real vs. predicted mel-spectrogram plot.
    spec_plot_fpath = plot_dir.joinpath(
        "step-{}-mel-spectrogram_sample_{}.png".format(step, sample_num))
    plot_title = "{}, {}, step={}, loss={:.5f}".format(
        "Tacotron", time_string(), step, loss)
    plot_spectrogram(mel_prediction, str(spec_plot_fpath), title=plot_title,
                     target_spectrogram=target_spectrogram,
                     max_len=target_spectrogram.size // hparams.num_mels)

    print("Input at step {}: {}".format(step, sequence_to_text(input_seq)))
def run_eval_part1(args):
    """Synthesize a fixed set of English sentences for one speaker and save wavs.

    The speaker is selected by a numeric `args.speaker_name`, embedded as a
    one-hot vector over 251 speakers. Output files go under
    `args.eval_results_dir/<speaker_name>/`.
    """
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    # speaker_name must be a plain integer id here (one-hot index below).
    speaker_id = int(speaker_name)
    sentences = [
        "Either measure the temperature with a bath thermometer or test the water with your wrist",
        "A test is a deliberate action or experiment to find out how well something works",
        "This was demonstrated in a laboratory experiment with rats",
        "This evidence supports the view that there is too much violence on television",
    ]
    # sentences = [sen.upper() for sen in sentences]
    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    # Extract the step number from a ".ckpt-NNNN" checkpoint name for filenames;
    # fall back to the raw checkpoint path when it does not match.
    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-"+str(ckpt_step.group(1)) if ckpt_step else syn_ckpt
    # One-hot speaker embedding over 251 speakers.
    speaker_embed = np.eye(251, dtype=np.float32)[speaker_id]
    for i, text in enumerate(sentences):
        path = os.path.join(eval_results_dir,
                            "%s-%s-eval-%03d_%s.wav" % (speaker_name, ckpt_step, i, "lpcnet"))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms([text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        print(np.shape(mel_spec))
        # mel_spec is world output feat
        #############
        # f0, sp, ap = np.split(mel_spec, [1, 514])
        # # f0 *= 100.0
        # # sp /= 1000.0
        # f0 = np.ascontiguousarray(f0.T, dtype=np.float64)
        # sp = np.ascontiguousarray(sp.T, dtype=np.float64)
        # ap = np.ascontiguousarray(ap.T, dtype=np.float64)
        # f0 = np.squeeze(f0, -1)
        # print(np.shape(f0), np.shape(sp), np.shape(ap))
        # wav = pw.synthesize(f0, sp, ap, hparams.sample_rate)
        ##########
        # NOTE(review): the slicing below indexes three axes, so mel_spec is
        # apparently 3-D with vocoder features on the last axis
        # (lf0 | mgc | bap) — TODO confirm against synthesize_spectrograms.
        lf0 = mel_spec[:, :, 0]
        mgc = mel_spec[:, :, 1:1 + hparams.n_mgc]
        bap = mel_spec[:, :, 1 + hparams.n_mgc:]
        wav = audio.synthesize(lf0, mgc, bap)
        audio.save_wav(wav, path, hparams.sample_rate)
def record(self):
    """Record roughly five seconds from the microphone, play it back, save it
    to disk and register it as a real utterance for the hard-coded speaker
    "user01". Aborts silently when the recording fails (returns None).
    """
    recorded = self.ui.record_one(encoder.sampling_rate, 5)
    if recorded is None:
        return

    # Immediate playback so the user can verify the take.
    self.ui.play(recorded, encoder.sampling_rate)

    speaker_name = "user01"
    utt_name = speaker_name + "_rec_%d" % int(time.time())

    # Persist the raw recording, then register it (embedding, plots, ...).
    audio.save_wav(recorded, _out_record_dir.joinpath(utt_name + '.wav'),
                   encoder.sampling_rate)  # save
    self.add_real_utterance(recorded, utt_name, speaker_name)
def run_mel_strip():
    """Debug utility: load one saved mel file, detect silence boundaries, invert
    each inter-silence segment to a wav with Griffin-Lim, and show the mel plot.

    NOTE(review): paths are hard-coded (a Windows absolute input path and a
    relative 'data/syns' output dir) — this is a throwaway inspection script.
    """
    # Local imports keep this debug helper self-contained.
    import numpy as np
    from tools.spec_processor import find_endpoint, find_silences
    from synthesizer.audio import inv_mel_spectrogram, save_wav
    from synthesizer.hparams import hparams
    from matplotlib import pyplot as plt
    inpath = Path(
        r'E:\lab\zhrtvc\zhrtvc\toolbox\saved_files\mels\wavs-P00173I-001_20170001P00173I0068.wav_1567509749_我家朵朵是世界上最漂亮的朵朵。。知道自己是什么样的人。要做什么。无需活在别人非议或期待里。你勤奋.npy'
    )
    data = np.load(inpath)
    # Transpose so time is the first axis (find_silences presumably expects
    # frames-first — TODO confirm).
    data = data.T
    print(data.shape)
    end_idx = find_silences(data, min_silence_sec=0.5, hop_silence_sec=0.2)
    print(end_idx, len(data))
    out_dir = Path(r'data/syns')
    # Each consecutive pair of silence markers bounds one speech segment;
    # a[-1] / b[0] look like (start, end) frame indices of the silences.
    for i, pair in enumerate(zip(end_idx[:-1], end_idx[1:]), 1):
        a, b = pair
        wav = inv_mel_spectrogram(data[a[-1]:b[0]].T, hparams)
        save_wav(wav, out_dir.joinpath(f'sil-{i:02d}.wav'), hparams.sample_rate)
    plt.imshow(data.T)
    plt.colorbar()
    plt.show()
def train(log_dir, args, hparams):
    """Train the (adversarial-speaker-classifier) Tacotron synthesizer.

    Sets up log/checkpoint directories, the data feeder, train and eval graphs,
    then runs the training loop with periodic summaries, evaluation, checkpoints
    and (stubbed) embedding exports. Returns `save_dir` on success; on any
    exception logs it and requests the coordinator to stop.
    """
    # ---- Directory layout under log_dir ----
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    meta_folder = os.path.join(log_dir, "metas")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    os.makedirs(meta_folder, exist_ok=True)

    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    # Speaker-classifier training uses an augmented metadata file.
    if hparams.if_use_speaker_classifier:
        metadat_fpath = os.path.join(args.synthesizer_root, "train_augment_speaker.txt")
    else:
        metadat_fpath = os.path.join(args.synthesizer_root, "train.txt")
    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Loading training data from: {}".format(metadat_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, metadat_fpath, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Character-embeddings metadata for the TensorBoard projector (one symbol
    # per line; space is written as "\s" for visibility).
    char_embedding_meta = os.path.join(meta_folder, "CharacterEmbeddings.tsv")
    if not os.path.isfile(char_embedding_meta):
        with open(char_embedding_meta, "w", encoding="utf-8") as f:
            for symbol in symbols:
                if symbol == " ":
                    symbol = "\\s"  # For visual purposes, swap space with \s
                f.write("{}\n".format(symbol))
    # Projector wants a path relative to the log dir.
    char_embedding_meta = char_embedding_meta.replace(log_dir, "..")

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=5)
    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, adversial_loss, opt = sess.run([
                    global_step, model.loss, model.adversial_loss, model.optimize
                ])
                # Report the synthesis loss excluding the adversarial term.
                loss -= adversial_loss
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}, adv_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average,
                    adversial_loss)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                # Abort on divergence.
                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Run eval and save eval stats
                    log("\nRunning evaluation at step {}".format(step))
                    eval_losses = []
                    before_losses = []
                    after_losses = []
                    stop_token_losses = []
                    linear_losses = []
                    linear_loss = None
                    adversial_losses = []
                    if hparams.predict_linear:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, linear_loss, mel_p, \
                                mel_t, t_len, align, lin_p, lin_t = sess.run(
                                    [
                                        eval_model.tower_loss[0],
                                        eval_model.tower_before_loss[0],
                                        eval_model.tower_after_loss[0],
                                        eval_model.tower_stop_token_loss[0],
                                        eval_model.tower_linear_loss[0],
                                        eval_model.tower_mel_outputs[0][0],
                                        eval_model.tower_mel_targets[0][0],
                                        eval_model.tower_targets_lengths[0][0],
                                        eval_model.tower_alignments[0][0],
                                        eval_model.tower_linear_outputs[0][0],
                                        eval_model.tower_linear_targets[0][0],
                                    ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            linear_losses.append(linear_loss)
                        linear_loss = sum(linear_losses) / len(linear_losses)
                        # Save a linear-spectrogram inversion of the last sample.
                        wav = audio.inv_linear_spectrogram(lin_p.T, hparams)
                        audio.save_wav(
                            wav,
                            os.path.join(
                                eval_wav_dir,
                                "step-{}-eval-wave-from-linear.wav".format(step)),
                            sr=hparams.sample_rate)
                    else:
                        for i in tqdm(range(feeder.test_steps)):
                            eloss, before_loss, after_loss, stop_token_loss, adversial_loss, mel_p, mel_t, t_len,\
                                align = sess.run(
                                    [
                                        eval_model.tower_loss[0],
                                        eval_model.tower_before_loss[0],
                                        eval_model.tower_after_loss[0],
                                        eval_model.tower_stop_token_loss[0],
                                        eval_model.tower_adversial_loss[0],
                                        eval_model.tower_mel_outputs[0][0],
                                        eval_model.tower_mel_targets[0][0],
                                        eval_model.tower_targets_lengths[0][0],
                                        eval_model.tower_alignments[0][0]
                                    ])
                            eval_losses.append(eloss)
                            before_losses.append(before_loss)
                            after_losses.append(after_loss)
                            stop_token_losses.append(stop_token_loss)
                            adversial_losses.append(adversial_loss)
                    # Average the collected eval metrics.
                    # NOTE(review): with predict_linear=True, adversial_losses
                    # stays empty and the division below would raise
                    # ZeroDivisionError — confirm which branch is actually used.
                    eval_loss = sum(eval_losses) / len(eval_losses)
                    before_loss = sum(before_losses) / len(before_losses)
                    after_loss = sum(after_losses) / len(after_losses)
                    stop_token_loss = sum(stop_token_losses) / len(
                        stop_token_losses)
                    adversial_loss = sum(adversial_losses) / len(
                        adversial_losses)

                    log("Saving eval log to {}..".format(eval_dir))
                    # Save some log to monitor model improvement on same unseen sequence
                    wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(
                            eval_wav_dir,
                            "step-{}-eval-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)
                    plot.plot_alignment(
                        align,
                        os.path.join(eval_plot_dir,
                                     "step-{}-eval-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, eval_loss),
                        max_len=t_len // hparams.outputs_per_step)
                    plot.plot_spectrogram(
                        mel_p,
                        os.path.join(
                            eval_plot_dir, "step-{"
                            "}-eval-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, eval_loss),
                        target_spectrogram=mel_t,
                        max_len=t_len)
                    if hparams.predict_linear:
                        plot.plot_spectrogram(
                            lin_p,
                            os.path.join(
                                eval_plot_dir,
                                "step-{}-eval-linear-spectrogram.png".format(
                                    step)),
                            title="{}, {}, step={}, loss={:.5f}".format(
                                "Tacotron", time_string(), step, eval_loss),
                            target_spectrogram=lin_t,
                            max_len=t_len,
                            auto_aspect=True)
                    log("Eval loss for global step {}: {:.3f}".format(
                        step, eval_loss))
                    log("Writing eval summary!")
                    add_eval_stats(summary_writer, step, linear_loss,
                                   before_loss, after_loss, stop_token_loss,
                                   adversial_loss, eval_loss)

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)
                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])
                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)
                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)
                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)
                    #log("Input at step {}: {}".format(step, sequence_to_text(input_seq)))

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

                    # Update Projector (currently disabled)
                    #log("\nSaving Model Character Embeddings visualization..")
                    #add_embedding_stats(summary_writer, [model.embedding_table.name],
                    #                    [char_embedding_meta],
                    #                    checkpoint_state.model_checkpoint_path)
                    #log("Tacotron Character embeddings have been updated on tensorboard!")

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps), slack=True)
            return save_dir
        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames, embed_filenames):
    """Synthesize mel spectrograms (and optionally linear spectrograms) for a
    batch of texts, save them to disk, and return the saved mel paths.

    Args:
        texts: list of input strings; its length must be a multiple of
            hparams.tacotron_num_gpus.
        basenames: per-text basenames used for output filenames (must not be None).
        out_dir: directory for the mel .npy outputs.
        log_dir: if not None, wavs and plots are additionally written under
            its "wavs/" and "plots/" subdirectories.
        mel_filenames: target-mel .npy paths, used only in GTA mode.
        embed_filenames: speaker-embedding .npy paths fed to the model.

    Returns:
        List of file paths of the saved mel spectrograms.

    Raises:
        NotImplementedError: when basenames is None.
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
    assert 0 == len(texts) % self._hparams.tacotron_num_gpus
    seqs = [
        np.asarray(text_to_sequence(text, cleaner_names)) for text in texts
    ]
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU max length.
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate(
            (input_seqs, device_input),
            axis=1) if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        # Ground-truth-aligned mode: feed the target mels as well.
        np_targets = [
            np.load(mel_filename) for mel_filename in mel_filenames
        ]
        target_lengths = [len(np_target) for np_target in np_targets]

        # Pad targets according to each GPU max length.
        target_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_target = np_targets[size_per_device * i:size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(
                device_target, self._hparams.outputs_per_step)
            target_seqs = np.concatenate(
                (target_seqs, device_target),
                axis=1) if target_seqs is not None else device_target
            # Not really used but setting it in case for future development maybe?
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
    feed_dict[self.speaker_embeddings] = [
        np.load(f) for f in embed_filenames
    ]

    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [
                self.mel_outputs, self.alignments,
                self.stop_token_prediction
            ],
            feed_dict=feed_dict)
        # Linearize outputs (1D arrays)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]
        stop_tokens = [
            token for gpu_token in stop_tokens for token in gpu_token
        ]
        if not self.gta:
            # Natural batch synthesis: derive mel lengths from the
            # stop-token predictions, then strip the batch-wise padding.
            target_lengths = self._get_output_lengths(stop_tokens)
            mels = [
                mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)
            ]
        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [
                self.linear_outputs, self.mel_outputs, self.alignments,
                self.stop_token_prediction
            ],
            feed_dict=feed_dict)
        # Linearize outputs (1D arrays)
        linears = [
            linear for gpu_linear in linears for linear in gpu_linear
        ]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [
            align for gpu_aligns in alignments for align in gpu_aligns
        ]
        stop_tokens = [
            token for gpu_token in stop_tokens for token in gpu_token
        ]
        # Natural batch synthesis.
        # Stop-token-based length detection is disabled here; use a large
        # sentinel length per sample instead so nothing is cut off.
        # target_lengths = self._get_output_lengths(stop_tokens)
        # BUGFIX: the sentinel must have one entry per sample — a single-element
        # list silently truncated mels/linears to one item through zip() and
        # broke the assertion below (and target_lengths[i]) for batches > 1.
        target_lengths = [9999] * len(mels)
        # Take off the batch-wise padding.
        mels = [
            mel[:target_length, :]
            for mel, target_length in zip(mels, target_lengths)
        ]
        linears = [
            linear[:target_length, :]
            for linear, target_length in zip(linears, target_lengths)
        ]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        # BUGFIX: was `raise NotImplemented()` — NotImplemented is a sentinel
        # value, not an exception class; calling it raised a confusing TypeError.
        raise NotImplementedError()

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the spectrogram to disk.
        # Note: outputs mel-spectrogram files and target ones have same names,
        # just different folders.
        mel_filename = os.path.join(out_dir, "mel-{}.npy".format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(wav,
                           os.path.join(
                               log_dir,
                               "wavs/wav-{}-mel.wav".format(basenames[i])),
                           sr=hparams.sample_rate)
            # save alignments
            plot.plot_alignment(alignments[i],
                                os.path.join(
                                    log_dir,
                                    "plots/alignment-{}.png".format(
                                        basenames[i])),
                                title="{}".format(texts[i]),
                                split_title=True,
                                max_len=target_lengths[i])
            # save mel spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir,
                             "plots/mel-{}.png".format(basenames[i])),
                title="{}".format(texts[i]),
                split_title=True)

            if hparams.predict_linear:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(wav,
                               os.path.join(
                                   log_dir,
                                   "wavs/wav-{}-linear.wav".format(
                                       basenames[i])),
                               sr=hparams.sample_rate)
                # save linear spectrogram plot
                plot.plot_spectrogram(linears[i],
                                      os.path.join(
                                          log_dir,
                                          "plots/linear-{}.png".format(
                                              basenames[i])),
                                      title="{}".format(texts[i]),
                                      split_title=True,
                                      auto_aspect=True)

    return saved_mels_paths
def run_eval_part1(args):
    """Evaluate a synthesizer checkpoint on Chinese sentences (converted to
    pinyin) for one speaker, using a speaker-encoder embedding computed from
    that speaker's reference audio; wavs are produced with Griffin-Lim.

    `args.speaker_name` selects the reference audio list from the hard-coded
    table below. Outputs go to `args.eval_results_dir/<speaker_name>/`.
    """
    speaker_enc_ckpt = args.speaker_encoder_checkpoint
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    # Hard-coded reference utterances per known speaker; KeyError for any other name.
    speaker_audio_dirs = {
        "speaker_name": ["speaker_audio_1.wav", "speaker_audio_2.wav"],
        "biaobei_speaker": [
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000005.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000006.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000007.wav",
        ],
        "SLR68_DEV_3756_22": [
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203118.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203322.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR68/dev/37_5622/37_5622_20170913203824.wav"
        ],
        "SLR38_P00001A": [
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/SLR38/ST-CMDS-20170001_1-OS/20170001P00001A0004.wav",
        ],
        "aishell_C0002": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0004.wav",
        ],
        "aishell_C0896": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0004.wav",
        ],
    }[speaker_name]
    # Evaluation sentences (Chinese); the commented lines are alternate inputs
    # kept for convenience.
    sentences = [
        # '美国主持人听到“中国”就插话',
        # '勉励乡亲们为过上更加幸福美好的生活继续团结奋斗。',
        # '中国基建领域又来了一款“神器”, 哪里不平平哪里',
        # '违反中央八项规定精神和廉洁纪律,违规出入私人会所和打高尔夫球',
        # '陪审团未能就其盗窃和藏匿文物罪名作出裁决',
        # '于美国首都华盛顿国家记者俱乐部召开的新闻发布会上说',
        # '杭州市卫健委某直属单位一名拟提副处级干部刘某公示期间,纪检监察组照例对其个人重大事项进行抽查',
        # '我国森林面积、森林蓄积分别增长一倍左右,人工林面积居全球第一',
        # '打打打打打打打打打打打',
        # '卡尔普陪外孙玩滑梯。',
        # '假语村言,别再拥抱我。',
        # '宝马配挂跛骡鞍,貂蝉怨枕董翁榻。',
        # '中国地震台网速报,'
        # '中国地震台网正式测定,',
        # '06月04日17时46分在台湾台东县海域(北纬22.82度,东经121.75度)发生5.8级地震',
        # '中国地震台网速报,中国地震台网正式测定:06月04日17时46分在台湾台东县海域(北纬22.82度,东经121.75度)发生5.8级地震',
        # '震源深度9千米,震中位于海中,距台湾岛最近约47公里。',
        # '刚刚,台湾发生5.8级地震,与此同时,泉州厦门漳州震感明显,',
        # '此次台湾地震发生后,许多网友为同胞祈福,愿平安,',
        '新世界百货望京店',
        '全聚德烤鸭店王府井店',
        '麻烦帮我把空调温度调整到二十四',
        '请帮我显示中央一套',  # aishell IC0896W0001.wav
        '确定下载三帝狂野飙车',  # aishell IC0896W0002.wav
        '请帮我开启深圳卫视国际频道',  # aishell IC0896W0003.wav
        '您吃饭了吗,我今天吃的太撑了',
        '您吃饭了吗?',
        '你多大了,你到底多大了,我猜你三十了,他多大了,他到底多大了,他猜你三十了',
        '二毛你今天沒课嘛还和李霞聊天',
    ]
    # Convert each sentence to a space-joined pinyin token string for the model.
    text2pinyin = partial(get_pinyin, std=True, pb=True)
    sentences = [' '.join(text2pinyin(sent)) for sent in sentences]
    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    encoder_infer.load_model(speaker_enc_ckpt)
    # Extract the step number from a ".ckpt-NNNN" name for output filenames.
    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-" + str(ckpt_step.group(1)) if ckpt_step else syn_ckpt
    # Concatenate all reference audio, save it for reference listening, and
    # embed it with the speaker encoder.
    speaker_audio_wav_list = [
        encoder_audio.preprocess_wav(wav_dir)
        for wav_dir in speaker_audio_dirs
    ]
    speaker_audio_wav = np.concatenate(speaker_audio_wav_list)
    print(os.path.join(eval_results_dir, '000_refer_speaker_audio.wav'))
    audio.save_wav(
        speaker_audio_wav,
        os.path.join(eval_results_dir, '000_refer_speaker_audio.wav'),
        hparams.sample_rate)
    speaker_embed = encoder_infer.embed_utterance(speaker_audio_wav)
    for i, text in enumerate(sentences):
        path = os.path.join(eval_results_dir,
                            "%s-eval-%03d.wav" % (ckpt_step, i))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms(
            [text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        # print(np.shape(mel_spec))
        wav = syner.griffin_lim(mel_spec)
        audio.save_wav(wav, path, hparams.sample_rate)
def train(log_dir, args, hparams):
    """Train the plain Tacotron synthesizer (no speaker classifier, no eval model).

    Sets up directories, feeder and train graph, then runs the training loop
    with periodic summaries and checkpoints (evaluation interval is a no-op
    here). Returns `save_dir` on success; on any exception logs it and
    requests the coordinator to stop.
    """
    # ---- Directory layout under log_dir ----
    save_dir = os.path.join(log_dir, "taco_pretrained")
    plot_dir = os.path.join(log_dir, "plots")
    wav_dir = os.path.join(log_dir, "wavs")
    mel_dir = os.path.join(log_dir, "mel-spectrograms")
    eval_dir = os.path.join(log_dir, "eval-dir")
    eval_plot_dir = os.path.join(eval_dir, "plots")
    eval_wav_dir = os.path.join(eval_dir, "wavs")
    tensorboard_dir = os.path.join(log_dir, "tacotron_events")
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(eval_dir, exist_ok=True)
    os.makedirs(eval_plot_dir, exist_ok=True)
    os.makedirs(eval_wav_dir, exist_ok=True)
    os.makedirs(tensorboard_dir, exist_ok=True)
    checkpoint_fpath = os.path.join(save_dir, "tacotron_model.ckpt")
    log("Checkpoint path: {}".format(checkpoint_fpath))
    log("Using model: Tacotron")
    log(hparams_debug_string())

    # Start by setting a seed for repeatability
    tf.set_random_seed(hparams.tacotron_random_seed)

    # Set up data feeder
    coord = tf.train.Coordinator()
    with tf.variable_scope("datafeeder") as scope:
        feeder = Feeder(coord, hparams)

    # Set up model:
    global_step = tf.Variable(0, name="global_step", trainable=False)
    model, stats = model_train_mode(args, feeder, hparams, global_step)
    #eval_model = model_test_mode(args, feeder, hparams, global_step)

    # Book keeping
    step = 0
    time_window = ValueWindow(100)
    loss_window = ValueWindow(100)
    saver = tf.train.Saver(max_to_keep=2)
    log("Tacotron training set to a maximum of {} steps".format(
        args.tacotron_train_steps))

    # Memory allocation on the GPU as needed
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Train
    with tf.Session(config=config) as sess:
        try:
            summary_writer = tf.summary.FileWriter(tensorboard_dir, sess.graph)
            sess.run(tf.global_variables_initializer())

            # saved model restoring
            if args.restore:
                # Restore saved model if the user requested it, default = True
                try:
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)
                    if checkpoint_state and checkpoint_state.model_checkpoint_path:
                        log("Loading checkpoint {}".format(
                            checkpoint_state.model_checkpoint_path), slack=True)
                        saver.restore(sess, checkpoint_state.model_checkpoint_path)
                    else:
                        log("No model to load at {}".format(save_dir), slack=True)
                        saver.save(sess, checkpoint_fpath, global_step=global_step)
                except tf.errors.OutOfRangeError as e:
                    log("Cannot restore checkpoint: {}".format(e), slack=True)
            else:
                log("Starting new training!", slack=True)
                saver.save(sess, checkpoint_fpath, global_step=global_step)

            # initializing feeder
            feeder.start_threads(sess)
            print("Feeder is intialized and model is ready to train.......")

            # Training loop
            while not coord.should_stop() and step < args.tacotron_train_steps:
                start_time = time.time()
                step, loss, opt = sess.run(
                    [global_step, model.loss, model.optimize])
                time_window.append(time.time() - start_time)
                loss_window.append(loss)
                message = "Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]".format(
                    step, time_window.average, loss, loss_window.average)
                log(message, end="\r", slack=(step % args.checkpoint_interval == 0))
                print(message)

                # Abort on divergence.
                if loss > 100 or np.isnan(loss):
                    log("Loss exploded to {:.5f} at step {}".format(
                        loss, step))
                    raise Exception("Loss exploded")

                if step % args.summary_interval == 0:
                    log("\nWriting summary at step {}".format(step))
                    summary_writer.add_summary(sess.run(stats), step)

                if step % args.eval_interval == 0:
                    # Evaluation is disabled in this variant.
                    pass

                if step % args.checkpoint_interval == 0 or step == args.tacotron_train_steps or \
                        step == 300:
                    # Save model and current global step
                    saver.save(sess, checkpoint_fpath, global_step=global_step)
                    log("\nSaving alignment, Mel-Spectrograms and griffin-lim inverted waveform..")
                    input_seq, mel_prediction, alignment, target, target_length = sess.run(
                        [
                            model.tower_inputs[0][0],
                            model.tower_mel_outputs[0][0],
                            model.tower_alignments[0][0],
                            model.tower_mel_targets[0][0],
                            model.tower_targets_lengths[0][0],
                        ])
                    # save predicted mel spectrogram to disk (debug)
                    mel_filename = "mel-prediction-step-{}.npy".format(step)
                    np.save(os.path.join(mel_dir, mel_filename),
                            mel_prediction.T,
                            allow_pickle=False)
                    # save griffin lim inverted wav for debug (mel -> wav)
                    wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
                    audio.save_wav(
                        wav,
                        os.path.join(wav_dir,
                                     "step-{}-wave-from-mel.wav".format(step)),
                        sr=hparams.sample_rate)
                    # save alignment plot to disk (control purposes)
                    plot.plot_alignment(
                        alignment,
                        os.path.join(plot_dir,
                                     "step-{}-align.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        max_len=target_length // hparams.outputs_per_step)
                    # save real and predicted mel-spectrogram plot to disk (control purposes)
                    plot.plot_spectrogram(
                        mel_prediction,
                        os.path.join(
                            plot_dir,
                            "step-{}-mel-spectrogram.png".format(step)),
                        title="{}, {}, step={}, loss={:.5f}".format(
                            "Tacotron", time_string(), step, loss),
                        target_spectrogram=target,
                        max_len=target_length)

                if step % args.embedding_interval == 0 or step == args.tacotron_train_steps or step == 1:
                    # Get current checkpoint state (projector export is disabled).
                    checkpoint_state = tf.train.get_checkpoint_state(save_dir)

            log("Tacotron training complete after {} global steps!".format(
                args.tacotron_train_steps), slack=True)
            return save_dir
        except Exception as e:
            log("Exiting due to exception: {}".format(e), slack=True)
            traceback.print_exc()
            coord.request_stop(e)
def run_eval_part1(args):
    """Evaluate a synthesizer checkpoint on English sentences for one speaker,
    using a speaker-encoder embedding computed from that speaker's reference
    audio; wavs are produced with Griffin-Lim.

    `args.speaker_name` selects the reference audio list from the hard-coded
    table below. Outputs go to `args.eval_results_dir/<speaker_name>/`.
    """
    speaker_enc_ckpt = args.speaker_encoder_checkpoint
    syn_ckpt = args.syn_checkpoint
    speaker_name = args.speaker_name
    eval_results_dir = os.path.join(args.eval_results_dir, speaker_name)
    if not os.path.exists(eval_results_dir):
        os.makedirs(eval_results_dir)
    # Hard-coded reference utterances per known speaker; KeyError for any other name.
    speaker_audio_dirs = {
        "speaker_name": ["speaker_audio_1.wav", "speaker_audio_2.wav"],
        "vctk_p225": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p225/p225_005.wav",
        ],
        "vctk_p226": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p226/p226_005.wav",
        ],
        "vctk_p227": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p227/p227_005.wav",
        ],
        "vctk_p228": [
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/vctk_dataset/wav16/p228/p228_005.wav",
        ],
        "biaobei_speaker": [
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000004.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000005.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000006.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/BZNSYP/wavs/000007.wav",
        ],
        "aishell_C0002": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0002/IC0002W0004.wav",
        ],
        "aishell_C0896": [
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0001.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0002.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0003.wav",
            "/home/zhangwenbo5/lihongfeng/corpus/aishell2/data/wav/C0896/IC0896W0004.wav",
        ],
    }[speaker_name]
    # Evaluation sentences; all but the last are upper-cased below, then one
    # lower-case duplicate is appended for comparison.
    sentences = [
        "THAT MATTER OF TROY AND ACHILLES WRATH ONE TWO THREE RATS",
        "ENDED THE QUEST OF THE HOLY GRAAL JERUSALEM A HANDFUL OF ASHES BLOWN BY THE WIND EXTINCT",
        "She can scoop these things into three red bags",
        "and we will go meet her Wednesday at the train station",
        "This was demonstrated in a laboratory experiment with rats."
    ]
    sentences = [sen.upper() for sen in sentences]
    sentences.append(
        "This was demonstrated in a laboratory experiment with rats")
    print('eval part1> model: %s.' % syn_ckpt)
    syner = syn_infer.Synthesizer(syn_ckpt)
    encoder_infer.load_model(speaker_enc_ckpt)
    # Extract the step number from a ".ckpt-NNNN" name for output filenames.
    ckpt_step = re.compile(r'.*?\.ckpt\-([0-9]+)').match(syn_ckpt)
    ckpt_step = "step-" + str(ckpt_step.group(1)) if ckpt_step else syn_ckpt
    # Concatenate all reference audio, save it for reference listening, and
    # embed it with the speaker encoder.
    speaker_audio_wav_list = [
        encoder_audio.preprocess_wav(wav_dir)
        for wav_dir in speaker_audio_dirs
    ]
    speaker_audio_wav = np.concatenate(speaker_audio_wav_list)
    print(
        os.path.join(eval_results_dir,
                     '%s-000_refer_speaker_audio.wav' % speaker_name))
    audio.save_wav(
        speaker_audio_wav,
        os.path.join(eval_results_dir,
                     '%s-000_refer_speaker_audio.wav' % speaker_name),
        hparams.sample_rate)
    speaker_embed = encoder_infer.embed_utterance(speaker_audio_wav)
    for i, text in enumerate(sentences):
        path = os.path.join(
            eval_results_dir,
            "%s-%s-eval-%03d.wav" % (speaker_name, ckpt_step, i))
        print('[{:<10}]: {}'.format('processing', path))
        mel_spec = syner.synthesize_spectrograms(
            [text], [speaker_embed])[0]  # batch synthesize
        print('[{:<10}]:'.format('text:'), text)
        # print(np.shape(mel_spec))
        wav = syner.griffin_lim(mel_spec)
        audio.save_wav(wav, path, hparams.sample_rate)