def test(model, config, prompts):
    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = audio.r
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.device('/cpu:0'):
        batch_inputs = data_input.load_prompts(prompts, ivocab)
        config.num_prompts = len(prompts)

    with tf.Session() as sess:
        # the normalization stats were stored in the checkpoint at train time;
        # declare matching variables so saver.restore fills them in
        # (both must be float32 to match the saved variables)
        stft_mean = tf.get_variable('stft_mean', shape=(1025*audio.r,), dtype=tf.float32)
        stft_std = tf.get_variable('stft_std', shape=(1025*audio.r,), dtype=tf.float32)

        # initialize model
        model = model(config, batch_inputs, train=False)

        train_writer = tf.summary.FileWriter('log/' + config.save_path + '/test', sess.graph)

        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver()

        print('restoring weights')
        latest_ckpt = tf.train.latest_checkpoint(
            'weights/' + config.save_path[:config.save_path.rfind('/')]
        )
        saver.restore(sess, latest_ckpt)

        stft_mean, stft_std = sess.run([stft_mean, stft_std])

        try:
            while True:
                out = sess.run([
                    model.output,
                    model.alignments,
                    batch_inputs
                ])
                outputs, alignments, inputs = out

                print('saving samples')
                for out, words, align in zip(outputs, inputs['text'], alignments):
                    # store a sample to listen to
                    text = ''.join([ivocab[w] for w in words])
                    attention_plot = data_input.generate_attention_plot(align)
                    sample = audio.invert_spectrogram(out*stft_std + stft_mean)
                    merged = sess.run(tf.summary.merge(
                        [tf.summary.audio(text, sample[None, :], sr),
                         tf.summary.image(text, attention_plot)]
                    ))
                    train_writer.add_summary(merged, 0)
        except tf.errors.OutOfRangeError:
            coord.request_stop()
            coord.join(threads)
def test(model, config, prompt_file):
    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = audio.r
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.device('/cpu:0'):
        batch_inputs, config.num_prompts = data_input.load_prompts(
            prompt_file, ivocab)

    with tf.Session() as sess:
        stft_mean = np.load(config.data_path + 'stft_mean.npy')
        stft_std = np.load(config.data_path + 'stft_std.npy')

        # initialize model
        model = model(config, batch_inputs, train=False)

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/test', sess.graph)

        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver()

        print('restoring weights')
        latest_ckpt = tf.train.latest_checkpoint(
            'weights/' + config.save_path[:config.save_path.rfind('/')])
        saver.restore(sess, latest_ckpt)

        try:
            while True:
                out = sess.run([model.output, model.alignments, batch_inputs])
                outputs, alignments, inputs = out

                print('saving samples')
                for out, words, align in zip(outputs, inputs['text'], alignments):
                    # store a sample to listen to
                    text = ''.join([ivocab[w] for w in words])
                    attention_plot = data_input.generate_attention_plot(align)
                    sample = audio.invert_spectrogram(out * stft_std + stft_mean)
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio(text, sample[None, :], sr),
                            tf.summary.image(text, attention_plot)
                        ]))
                    train_writer.add_summary(merged, 0)
        except tf.errors.OutOfRangeError:
            coord.request_stop()
            coord.join(threads)
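# --- Hedged sketch (not from the original source): how the stft_mean.npy /
# stft_std.npy files consumed by the test() above could be produced. Assumes
# a (num_frames, 1025 * r) array of training spectrogram frames; the
# project's actual preprocessing step may differ.
def save_spectrogram_stats(all_frames, data_path):
    """Persist the per-bin mean/std used to denormalize model output."""
    np.save(data_path + 'stft_mean.npy', all_frames.mean(axis=0))
    np.save(data_path + 'stft_std.npy', all_frames.std(axis=0))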
def train(model, config, num_steps=1000000):
    sr = 24000 if 'vctk' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.Session() as sess:
        inputs, names, num_speakers, stft_mean, stft_std = \
            data_input.load_from_npy(config.data_path)

        config.num_speakers = num_speakers

        # save the mean and std as tensorflow variables so they are
        # saved with the weights
        tf.Variable(stft_mean, name='stft_mean')
        tf.Variable(stft_std, name='stft_std')

        batch_inputs = data_input.build_dataset(sess, inputs, names)

        # initialize model
        model = model(config, batch_inputs, train=True)

        train_writer = tf.summary.FileWriter('log/' + config.save_path + '/train', sess.graph)

        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

        if config.restore:
            print('restoring weights')
            latest_ckpt = tf.train.latest_checkpoint(
                'weights/' + config.save_path[:config.save_path.rfind('/')]
            )
            if RESTORE_FROM is None:
                if latest_ckpt is not None:
                    saver.restore(sess, latest_ckpt)
            else:
                saver.restore(sess, 'weights/' + config.save_path + '-' + str(RESTORE_FROM))

        lr = model.config.init_lr
        annealing_rate = model.config.annealing_rate

        for _ in tqdm(range(num_steps)):
            out = sess.run([
                model.train_op,
                model.global_step,
                model.loss,
                model.output,
                model.alignments,
                model.merged,
                batch_inputs
            ], feed_dict={model.lr: lr})
            _, global_step, loss, output, alignments, summary, inputs = out

            train_writer.add_summary(summary, global_step)

            # detect gradient explosion
            if loss > 1e8 and global_step > 500:
                print('loss exploded')
                break

            if global_step % 1000 == 0:
                lr *= annealing_rate

            if global_step % SAVE_EVERY == 0 and global_step != 0:
                print('saving weights')
                if not os.path.exists('weights/' + config.save_path):
                    os.makedirs('weights/' + config.save_path)
                saver.save(sess, 'weights/' + config.save_path, global_step=global_step)

                print('saving sample')
                # store a sample to listen to
                ideal = audio.invert_spectrogram(inputs['stft'][0]*stft_std + stft_mean)
                sample = audio.invert_spectrogram(output[0]*stft_std + stft_mean)
                attention_plot = data_input.generate_attention_plot(alignments[0])
                step = '_' + str(global_step)
                merged = sess.run(tf.summary.merge(
                    [tf.summary.audio('ideal' + step, ideal[None, :], sr),
                     tf.summary.audio('sample' + step, sample[None, :], sr),
                     tf.summary.image('attention' + step, attention_plot)]
                ))
                train_writer.add_summary(merged, global_step)

        coord.request_stop()
        coord.join(threads)
def train(model, config, num_steps=1000000):
    sr = 24000 if 'blizzard' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.Session() as sess:
        inputs, stft_mean, stft_std = data_input.load_from_npy(config.data_path)

        batch_inputs = data_input.build_dataset(sess, inputs)

        # initialize model
        model = model(config, batch_inputs, train=True)

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/train', sess.graph)

        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

        if config.restore:
            print('restoring weights')
            latest_ckpt = tf.train.latest_checkpoint(
                'weights/' + config.save_path[:config.save_path.rfind('/')])
            if RESTORE_FROM is None:
                saver.restore(sess, latest_ckpt)
            else:
                saver.restore(
                    sess, 'weights/' + config.save_path + '-' + str(RESTORE_FROM))

        lr = model.config.init_lr
        annealing_rate = model.config.annealing_rate

        for _ in tqdm(range(num_steps)):
            out = sess.run([
                model.train_op,
                model.global_step,
                model.loss,
                model.output,
                model.alignments,
                model.merged,
                batch_inputs
            ], feed_dict={model.lr: lr})
            _, global_step, loss, output, alignments, summary, inputs = out

            train_writer.add_summary(summary, global_step)

            # detect gradient explosion
            if loss > 1e8 and global_step > 500:
                print('loss exploded')
                break

            if global_step % 1000 == 0:
                lr *= annealing_rate

            if global_step % SAVE_EVERY == 0 and global_step != 0:
                print('saving weights')
                if not os.path.exists('weights/' + config.save_path):
                    os.makedirs('weights/' + config.save_path)
                saver.save(sess, 'weights/' + config.save_path,
                           global_step=global_step)

                print('saving sample')
                # store a sample to listen to
                ideal = audio.invert_spectrogram(inputs['stft'][0] * stft_std + stft_mean)
                sample = audio.invert_spectrogram(output[0] * stft_std + stft_mean)
                attention_plot = data_input.generate_attention_plot(alignments[0])
                step = '_' + str(global_step)
                merged = sess.run(
                    tf.summary.merge([
                        tf.summary.audio('ideal' + step, ideal[None, :], sr),
                        tf.summary.audio('sample' + step, sample[None, :], sr),
                        tf.summary.image('attention' + step, attention_plot)
                    ]))
                train_writer.add_summary(merged, global_step)

        coord.request_stop()
        coord.join(threads)
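# --- Hedged sketch (not from the original source): audio.invert_spectrogram
# is called throughout but not defined in this section. This is a minimal
# Griffin-Lim inversion illustrating the standard technique such a helper
# usually wraps; the magnitude layout, n_fft, hop_length, and the use of
# librosa are assumptions, not the project's confirmed implementation.
import numpy as np
import librosa


def griffin_lim(magnitude, n_fft=2048, hop_length=512, n_iters=50):
    """Estimate a waveform from an STFT magnitude by iterative phase refinement."""
    spec = magnitude.T.astype(np.complex64)  # librosa expects (bins, frames)
    # start from random phase, then alternate between the time domain and the
    # fixed magnitude until the phase estimate stabilizes
    angles = np.exp(2j * np.pi * np.random.rand(*spec.shape))
    for _ in range(n_iters):
        wav = librosa.istft(spec * angles, hop_length=hop_length)
        rebuilt = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length)
        angles = np.exp(1j * np.angle(rebuilt))
    return librosa.istft(spec * angles, hop_length=hop_length)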
    tok, lenar = utt2tok(got_utt)
    print("Generating ...")
    print('next is --------', 'original')
    pred_out, alpha_hjk_img, alpha_style_hjk_img, weight_hjk_img = sess.run(
        [
            model.out_stftm, model.final_alpha,
            model.final_alpha_style, model.final_weight_ta
        ],
        feed_dict={inp: tok, inp_mask: lenar})
    # denormalize the predicted log-magnitude spectrogram
    pred_out = pred_out * stats["log_stftm_std"] + stats["log_stftm_mean"]
    pred_audio, exp_spec = audio.invert_spectrogram(pred_out, 1.2)
    siowav.write(
        os.path.join(generate_wav_path, "%dXoriginal.wav" % cnt),
        sr, pred_audio)

    for i in range(0, styles_kind):
        # tim == 0 zeros out token i; tim == 5 keeps only token i
        # (the upper bound must be 6 for the tim == 5 branch to be reachable)
        for tim in range(0, 6):
            print('next is --------', i, tim)
            unique_style_token = np.copy(trained_style_token)
            if tim == 0:
                unique_style_token[0][i] = 0
            elif tim == 5:
                for j in range(0, styles_kind):
                    if j != i:
                        unique_style_token[0][j] = 0
            else:
                    feed_dict={
                        inp: batch_inp,
                        inp_mask: batch_inp_mask,
                        mel_gtruth: batch_mel_gtruth,
                        spec_gtruth: batch_spec_gtruth
                    })
                writer.add_summary(summary_str, global_step_eval)

                # draw img of weighting
                # draw_weighting_spec(raw_weighting[0], train_r, pred_out[0], 'im' + str(global_step_eval))

                all_pred_out = []
                # generate general voice
                pred_out = pred_out * stats["log_stftm_std"] + stats["log_stftm_mean"]
                pred_audio, exp_spec = audio.invert_spectrogram(pred_out[0], 1.2)
                pred_audio = np.reshape(pred_audio, (1, pred_audio.shape[-1]))
                all_pred_out.append(pred_audio)

                # pred_audio_summary_str = sess.run(train_model.pred_audio_summary, feed_dict={train_model.pred_audio_holder: pred_audio})
                # writer.add_summary(pred_audio_summary_str, global_step_eval)

                # generate unique style voice
                for i in range(0, styles_kind, 3):
                    for tim in range(1, 4):
                        unique_style_token = np.copy(trained_style_token)
                        # shift the token table by tim copies of token i's embedding
                        unique_style_token[0] += tim * trained_style_token[0][i]
                        sess.run(ass_opt,
                                 feed_dict={ass_style_token: unique_style_token})
def train(model, config, num_steps=1000000):
    sr = 24000 if 'vctk' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    print("Sampling mean and std...")
    if args.hdf5:
        stft_mean, stft_std, mel_mean, mel_std = \
            data_input.get_stft_and_mel_std_and_mean_from_table(
                os.path.join(config.data_path, "data"))
    else:
        stft_mean, stft_std, mel_mean, mel_std = \
            data_input.get_stft_and_mel_std_and_mean_from_tfrecords(
                config.tf_record_files)
    print("Sampled mean and std!")

    print("Building dataset...")
    loader, reader, names, shapes, types = data_input.build_dataset_with_hdf5_table(
        os.path.join(config.data_path, "data"))
    print("Built dataset!")

    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    with tf.Session(config=config_proto) as sess:
        if args.hdf5:
            batch_inputs, stft_mean, stft_std = data_input.build_hdf5_dataset_from_table(
                os.path.join(config.data_path, "data"), sess, loader, names,
                shapes, types, ivocab, stft_mean, stft_std, mel_mean, mel_std)
        else:
            batch_inputs = data_input.build_tfrecord_dataset(
                config.tf_record_files, sess, names, ivocab,
                stft_mean, stft_std, mel_mean, mel_std)

        # save the mean and std as tensorflow variables so they are
        # saved with the weights
        tf.Variable(stft_mean, name='stft_mean')
        tf.Variable(stft_std, name='stft_std')

        print("Initializing model...")
        # initialize model
        model = model(config, batch_inputs, train=True)
        print("Model initialized!")

        train_writer = tf.summary.FileWriter(
            'log/' + config.save_path + '/train', sess.graph)

        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        print("Starting queue runners...")
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        print("Started queue runners!")

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

        if config.restore:
            print('restoring weights')
            latest_ckpt = tf.train.latest_checkpoint(
                'weights/' + config.save_path[:config.save_path.rfind('/')])
            if RESTORE_FROM is None:
                if latest_ckpt is not None:
                    saver.restore(sess, latest_ckpt)
            else:
                saver.restore(
                    sess, 'weights/' + config.save_path + '-' + str(RESTORE_FROM))

        lr = model.config.init_lr
        annealing_rate = model.config.annealing_rate
        if config.restore:
            print("Restored global step: %s" % str(model.global_step.eval(sess)))
            # replay the staircase decay for the steps already taken
            lr *= (annealing_rate ** (model.global_step.eval(sess) // ANNEALING_STEPS))
            print("Recovered learning rate: '%s'" % str(lr))
        print("Using learning rate: '%s' and annealing rate: '%s'" % (lr, annealing_rate))

        print("Looping over num_steps: %s" % str(num_steps))
        with loader.begin(sess):
            for _ in tqdm(range(num_steps)):
                print("Running sess...")
                out = sess.run([
                    model.train_op,
                    model.global_step,
                    model.loss,
                    model.output,
                    model.alignments,
                    model.merged,
                    batch_inputs
                ], feed_dict={model.lr: lr})
                _, global_step, loss, output, alignments, summary, inputs = out
                print("Finished run: %d!" % global_step)

                train_writer.add_summary(summary, global_step)

                # detect gradient explosion
                if loss > 1e9 and global_step > 50000:
                    print('loss exploded')
                    break

                if global_step % ANNEALING_STEPS == 0:
                    old_lr = lr
                    lr *= annealing_rate
                    print("Updated learning rate from: %s to %s" % (str(old_lr), str(lr)))

                if global_step % SAVE_EVERY == 0 and global_step != 0:
                    print('saving weights')
                    if not os.path.exists('weights/' + config.save_path):
                        os.makedirs('weights/' + config.save_path)
                    saver.save(sess, 'weights/' + config.save_path,
                               global_step=global_step)

                    print('saving sample')
                    print("stft shape: %s" % str(inputs['stft'][0].shape))
                    # store a sample to listen to
                    ideal = audio.invert_spectrogram(inputs['stft'][0] * stft_std + stft_mean)
                    sample = audio.invert_spectrogram(output[0] * stft_std + stft_mean)
                    attention_plot = data_input.generate_attention_plot(alignments[0])
                    step = '_' + str(global_step) + '_'

                    # Remove pad words
                    text_string = "".join(
                        filter(lambda x: x != "<pad>",
                               [ivocab[word] for word in inputs['text'][0]]))
                    # Replace chars summary names reject with 0
                    # (this is the regex used for name scopes in ops.py in tensorflow)
                    text_string = "".join(
                        map(lambda x: "0" if re.match(r"[A-Za-z0-9_.\-/ ]", x) is None else x,
                            text_string))
                    text_string = text_string.strip()
                    quoted_text_string = "\"" + text_string + "\""

                    print("ideal: %s %s %s" % (str(step), str(ideal[None, :]), str(sr)))
                    print("sample: %s %s %s" % (str(step), str(sample[None, :]), str(sr)))
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio('ideal' + step + text_string,
                                             ideal[None, :], sr),
                            tf.summary.audio('sample' + step + text_string,
                                             sample[None, :], sr),
                            tf.summary.image('attention' + step, attention_plot),
                            tf.summary.text('text' + step,
                                            tf.convert_to_tensor(quoted_text_string))
                        ]))
                    train_writer.add_summary(merged, global_step)

                if global_step % 50 == 0:
                    print("This is reassurance. Global step at: %d" % global_step)

        coord.request_stop()
        coord.join(threads)
        reader.close()
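# --- Hedged sketch (illustration only): the learning-rate recovery in the
# train() above replays the staircase decay, one multiplication by
# annealing_rate per ANNEALING_STEPS steps already taken. The constants in
# the example are made up.
def recovered_lr(init_lr, annealing_rate, global_step, annealing_steps):
    """Learning rate after global_step steps of staircase annealing."""
    return init_lr * annealing_rate ** (global_step // annealing_steps)


# e.g. restoring at step 2500 with annealing every 1000 steps applies two
# decays: 1e-3 * 0.9 ** 2 == 8.1e-4
assert abs(recovered_lr(1e-3, 0.9, 2500, 1000) - 8.1e-4) < 1e-12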
def check_tf_records_with_tensorboard(files, data_path, save_path):
    sr = 24000 if "vctk" in save_path else 16000
    meta = data_input.load_meta(data_path)
    ivocab = meta["vocab"]

    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter('log/' + save_path + '/debug', sess.graph)

        record_placeholder = tf.placeholder(tf.string)
        features_in = tf.parse_single_example(
            record_placeholder,
            features={
                "index": tf.FixedLenFeature([], tf.int64),
                # "stfts": tf.FixedLenFeature((180, 2050), tf.float32),
                "stfts": tf.FixedLenFeature((504, 2050), tf.float32),
                "stfts_shape": tf.FixedLenFeature((2), tf.int64),
                # "mels": tf.FixedLenFeature((180, 160), tf.float32),
                "mels": tf.FixedLenFeature((504, 160), tf.float32),
                "mels_shape": tf.FixedLenFeature((2), tf.int64),
                "texts": tf.VarLenFeature(tf.int64),
                "text_lens": tf.FixedLenFeature([], tf.int64),
                "speech_lens": tf.FixedLenFeature([], tf.int64),
            })

        for file_path in files:
            print("Reading file: %s" % file_path)
            for record, i in zip(tf.python_io.tf_record_iterator(file_path),
                                 range(count_records([file_path]))):
                if i % SAVE_EVERY == 0 and (i != 0 or SAVE_EVERY == 1):
                    print("Iteration %d" % i)
                    features = sess.run(features_in,
                                        feed_dict={record_placeholder: record})
                    texts = features["texts"]
                    texts = tf.sparse_to_dense(texts.indices, texts.dense_shape,
                                               texts.values)

                    # Debugging THIS script.
                    """
                    print("texts (numbers): %s" % str(texts.eval(session=sess)))
                    for word in texts.eval(session=sess):
                        try:
                            print("word: %s" % str(ivocab[word]))
                        except:
                            print("invalid word: %s" % str(word))
                    print("ivocab: %s" % str(ivocab))
                    """

                    # Convert integers to words
                    texts = "".join(
                        filter(lambda x: x != "<pad>",
                               [ivocab[word] for word in texts.eval(session=sess)]))
                    print("Texts: %s" % texts)
                    texts_filtered = "".join(
                        filter(lambda x: x in set(string.printable), texts))
                    # print("Texts filtered: %s" % texts_filtered)

                    print("stfts shape: %s" % str(features["stfts"].shape))
                    print("mels shape: %s" % str(features["mels"].shape))

                    print("saving sample")
                    # store a sample to listen to
                    ideal = audio.invert_spectrogram(features["stfts"])
                    step = "_" + str(i) + "_"
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio(
                                "ideal" + step + "\"" + texts_filtered + "\"",
                                ideal[None, :], sr),
                            tf.summary.text("text" + step,
                                            tf.convert_to_tensor(texts_filtered))
                        ]))
                    train_writer.add_summary(merged, i)
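# --- Hedged sketch (not from the original source): count_records is called
# in the checker above but not defined in this section. A plausible
# implementation on the same TF 1.x API the checker already uses:
def count_records(files):
    """Count the serialized examples across a list of TFRecord files."""
    return sum(1
               for file_path in files
               for _ in tf.python_io.tf_record_iterator(file_path))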
def train(model, config, num_steps=1000000):
    sr = 24000 if 'vctk' in config.data_path else 16000
    meta = data_input.load_meta(config.data_path)
    config.r = meta['r']
    ivocab = meta['vocab']
    config.vocab_size = len(ivocab)

    with tf.Session() as sess:
        # Added comment out
        """
        inputs, names, num_speakers, stft_mean, stft_std = \
            data_input.load_from_npy(config.data_path)
        """
        # Added
        # And THEN commented out
        """
        print("Loading inputs...")
        inputs, names, num_speakers = data_input.load_from_hdf5(config.data_path)
        print("Loaded all inputs!")
        """
        # Added comment out
        """
        config.num_speakers = num_speakers
        """

        # Added comment out
        # save the mean and std as tensorflow variables so they are saved with the weights
        """
        tf.Variable(stft_mean, name='stft_mean')
        tf.Variable(stft_std, name='stft_std')
        """

        print("Building dataset...")
        # Added comment out
        """
        batch_inputs = data_input.build_dataset(sess, inputs, names)
        """
        # Added
        batch_inputs, loader = data_input.build_dataset_with_hdf5(
            os.path.join(config.data_path, "data"))
        print("Built dataset!")
        print("batch_inputs: %s" % str(batch_inputs))

        with loader.begin(sess):
            print("Initializing model...")
            # initialize model
            model = model(config, batch_inputs, train=True)
            print("Model initialized!")

            train_writer = tf.summary.FileWriter(
                'log/' + config.save_path + '/train', sess.graph)

            tf.global_variables_initializer().run()
            coord = tf.train.Coordinator()
            print("Starting queue runners...")
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            print("Started queue runners!")

            saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=3)

            if config.restore:
                print('restoring weights')
                latest_ckpt = tf.train.latest_checkpoint(
                    'weights/' + config.save_path[:config.save_path.rfind('/')])
                if RESTORE_FROM is None:
                    if latest_ckpt is not None:
                        saver.restore(sess, latest_ckpt)
                else:
                    saver.restore(
                        sess,
                        'weights/' + config.save_path + '-' + str(RESTORE_FROM))

            lr = model.config.init_lr
            annealing_rate = model.config.annealing_rate

            print("Looping over num_steps: %s" % str(num_steps))
            for _ in tqdm(range(num_steps)):
                print("Running sess...")
                out = sess.run([
                    model.train_op,
                    model.global_step,
                    model.loss,
                    model.output,
                    model.alignments,
                    model.merged,
                    batch_inputs
                ], feed_dict={model.lr: lr})
                _, global_step, loss, output, alignments, summary, inputs = out
                print("Finished run: %d!" % global_step)

                train_writer.add_summary(summary, global_step)

                # detect gradient explosion
                if loss > 1e8 and global_step > 500:
                    print('loss exploded')
                    break

                if global_step % 1000 == 0:
                    lr *= annealing_rate

                if global_step % SAVE_EVERY == 0 and global_step != 0:
                    print('saving weights')
                    if not os.path.exists('weights/' + config.save_path):
                        os.makedirs('weights/' + config.save_path)
                    saver.save(sess, 'weights/' + config.save_path,
                               global_step=global_step)

                    print('saving sample')
                    # store a sample to listen to
                    # NOTE: stft_mean/stft_std are still referenced here even
                    # though the code that loaded them is commented out above,
                    # so this branch raises NameError as written
                    ideal = audio.invert_spectrogram(
                        inputs['stft'][0] * stft_std + stft_mean)
                    sample = audio.invert_spectrogram(
                        output[0] * stft_std + stft_mean)
                    attention_plot = data_input.generate_attention_plot(
                        alignments[0])
                    step = '_' + str(global_step)
                    merged = sess.run(
                        tf.summary.merge([
                            tf.summary.audio('ideal' + step, ideal[None, :], sr),
                            tf.summary.audio('sample' + step, sample[None, :], sr),
                            tf.summary.image('attention' + step, attention_plot)
                        ]))
                    train_writer.add_summary(merged, global_step)

                if global_step % 50 == 0:
                    print("This is reassurance. Global step at: %d" % global_step)

            coord.request_stop()
            coord.join(threads)